From f3a4e18d0236a1bf2819c981f40737cf78500599 Mon Sep 17 00:00:00 2001 From: Debadri Basak Date: Mon, 3 Nov 2025 13:11:00 +0000 Subject: [PATCH 001/313] Adding implementation for missing origin statistics on LifetimeSafetyAnalysis --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 21 +++++++++++++++++++ .../Analyses/LifetimeSafety/Origins.h | 6 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 5 +++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 ++++++++++++++ clang/lib/Sema/AnalysisBasedWarnings.cpp | 4 +++- 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..4952d84a80369 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include namespace clang::lifetimes { @@ -60,6 +64,9 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { +private: + static llvm::StringMap MissingOriginCount; + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -73,6 +80,20 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } + static void PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::count) { + OS << expr << " : " << count << '\n'; + } + } + + static void UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::count[std::string(expr)] += missing_origin_count; + } + } + private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..231cc60b7e097 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector AllOrigins; llvm::DenseMap DeclToOriginID; llvm::DenseMap ExprToOriginID; + llvm::StringMap ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..a76fdd2535d97 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,14 +23,18 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include namespace clang::lifetimes { namespace internal { +llvm::StringMap LifetimeSafetyAnalysis::MissingOriginCount; + LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} @@ -66,6 +70,7 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); + UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..abe067a829cb7 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +24,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +43,16 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + const QualType ExprType = E.getType(); + auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. 
// For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..009994e189220 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -30,6 +30,7 @@ #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +54,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -3132,8 +3134,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } void clang::sema::AnalysisBasedWarnings::PrintStats() const { + clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 0 : NumCFGBlocks/NumCFGsBuilt; From 2ea1edbeea095b6baec25233c5076de6f5463e8b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 3 Nov 2025 13:22:03 +0000 Subject: [PATCH 002/313] [gn build] Port de2797c888e0 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index f280f695cd3ab..2f84999621e1b 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -32,6 +32,7 @@ static_library("bugprone") { "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", "DanglingHandleCheck.cpp", + "DefaultOperatorNewOnOveralignedTypeCheck.cpp", "DerivedMethodShadowingBaseMethodCheck.cpp", "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index 3ad0a83a8fb23..ec642b6afad66 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -16,7 +16,6 @@ static_library("cert") { ] sources = [ "CERTTidyModule.cpp", - "DefaultOperatorNewAlignmentCheck.cpp", "DontModifyStdNamespaceCheck.cpp", "FloatLoopCounter.cpp", "LimitedRandomnessCheck.cpp", From fa85d2da89acec96c6c31f3d0e2dd009b6286054 Mon Sep 17 00:00:00 2001 From: Debadri Basak Date: Mon, 3 Nov 2025 13:23:42 +0000 Subject: [PATCH 003/313] Minor refactoring of the static functions --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 14 ++------------ .../lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 4952d84a80369..7490df90a3282 100644 --- 
a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -80,19 +80,9 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::count) { - OS << expr << " : " << count << '\n'; - } - } + static void PrintStats(llvm::raw_ostream &OS); - static void UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::count[std::string(expr)] += missing_origin_count; - } - } + static void UpdateMissingOriginCount(const OriginManager &OM); private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index a76fdd2535d97..828c08d1cbeed 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -39,6 +39,19 @@ LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} +void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + +void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From ecc70fdbe36b0b8994a57f8c213c2f2ce651563d Mon Sep 17 00:00:00 2001 From: Koakuma Date: Mon, 3 Nov 2025 20:31:39 +0700 Subject: [PATCH 004/313] [llvm-lit] Add `sparc64` to the list of BE triples (#166113) Linux uses the `sparc64` triple name on 64-bit SPARCs. This should fix the test failure in discriminated-union.ll.
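For reference, the failure mode is easy to reproduce in isolation: in the old pattern, `sparc` and `sparcv9` must be immediately followed by `-`, so neither alternative ever matches `sparc64-unknown-linux-gnu`. A minimal self-contained sketch of the check (C++ `<regex>` purely for illustration, with the alternation abbreviated to the SPARC entries; lit's real check is the Python `re.match` in the diff below):

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
  const std::string Triple = "sparc64-unknown-linux-gnu";
  // Old pattern: "sparc" and "sparcv9" must be followed by '-', so the
  // Linux 64-bit SPARC triple is never classified as big-endian.
  const std::regex Old("(sparc|sparcv9)-.*");
  // New pattern: the added "sparc64" alternative matches the triple.
  const std::regex New("(sparc|sparcv9|sparc64)-.*");
  assert(!std::regex_match(Triple, Old));
  assert(std::regex_match(Triple, New));
  return 0;
}
```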
--- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..974af4b571503 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -474,7 +474,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") From 8bd7fc7211782e7b2ba76070ee6af46f4913a185 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Mon, 3 Nov 2025 21:36:10 +0800 Subject: [PATCH 005/313] [ExprMutation] fix false positives on pointer-to-member operator (#166069) Fixed: #161913 --------- Co-authored-by: Baranov Victor --- clang-tools-extra/docs/ReleaseNotes.rst | 3 ++- clang/lib/Analysis/ExprMutationAnalyzer.cpp | 13 ++++++++----- .../Analysis/ExprMutationAnalyzerTest.cpp | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 35a64395f04e9..26f00e9a8a294 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -379,7 +379,8 @@ Changes in existing checks ` check to avoid false positives when pointers is transferred to non-const references and avoid false positives of function pointer and fix false - positives on return of non-const pointer. + positives on return of non-const pointer and fix false positives on + pointer-to-member operator. - Improved :doc:`misc-header-include-cycle ` check performance.
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 75b17c545bb78..54c30c05c3e19 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -746,11 +746,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) { Stm, Context)); if (MemberCallExpr) return MemberCallExpr; - const auto Matches = - match(stmt(forEachDescendant( - memberExpr(hasObjectExpression(canResolveToExprPointee(Exp))) - .bind(NodeID::value))), - Stm, Context); + const auto Matches = match( + stmt(forEachDescendant( + expr(anyOf(memberExpr( + hasObjectExpression(canResolveToExprPointee(Exp))), + binaryOperator(hasOperatorName("->*"), + hasLHS(canResolveToExprPointee(Exp))))) + .bind(NodeID::value))), + Stm, Context); return findExprMutation(Matches); } diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index ef229606de0f0..8fc9a66dbda7e 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) { } } +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) { + // GH161913 + const std::string Code = R"( + struct S { int i; }; + void f(S s) { + S *x = &s; + (x->*(&S::i))++; + } + )"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); +} + } // namespace clang From 795fa9ea00a18b2ccd779efb17ce805a73d1ce7a Mon Sep 17 00:00:00 2001 From: Shikhar Jain Date: Mon, 3 Nov 2025 19:06:37 +0530 Subject: [PATCH 006/313] Bound ISL operations during pre-vectorization (#165204) Bound ISL operations during pre-vectorization to prevent indefinite compilation. The MaxOpGuard previously used for schedule computation is now extended to also guard pre-vectorization optimizations. This patch includes a reduced test case derived from the original bug report. 
--------- Co-authored-by: Michael Kruse --- polly/lib/Transform/ScheduleOptimizer.cpp | 53 +++++++++++++------ polly/lib/Transform/ScheduleTreeTransform.cpp | 5 ++ .../prevectorization_islbound.ll | 37 +++++++++++++ 3 files changed, 78 insertions(+), 17 deletions(-) create mode 100644 polly/test/ScheduleOptimizer/prevectorization_islbound.ll diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 070700a64a168..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -237,6 +237,7 @@ struct OptimizerAdditionalInfoTy { bool Postopts; bool Prevect; bool &DepsChanged; + IslMaxOperationsGuard &MaxOpGuard; }; class ScheduleTreeOptimizer final { @@ -381,6 +382,8 @@ class ScheduleTreeOptimizer final { isl::schedule_node ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); Node = Node.child(0).child(0); isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); @@ -391,6 +394,8 @@ ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = Node.parent().parent(); isl::union_set Options = IsolateOption.unite(AtomicOption); + if (Node.is_null()) + return {}; isl::schedule_node_band Result = Node.as().set_ast_build_options(Options); return Result; @@ -411,9 +416,13 @@ struct InsertSimdMarkers final : ScheduleNodeRewriter { isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); assert(DimToVectorize < ScheduleDimensions); @@ -439,12 +448,15 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( // Sink the inner loop into the smallest possible statements to make them // represent a single vector instruction if possible. Node = isl::manage(isl_schedule_node_band_sink(Node.release())); + if (Node.is_null()) + return {}; // Add SIMD markers to those vector statements. InsertSimdMarkers SimdMarkerInserter; Node = SimdMarkerInserter.visit(Node); - PrevectOpts++; + if (!Node.is_null()) + PrevectOpts++; return Node.parent(); } @@ -535,6 +547,8 @@ ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { isl::schedule_node ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); for (int i = Dims - 1; i >= 0; i--) @@ -572,9 +586,14 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, Node = applyTileBandOpt(Node); if (OAI->Prevect) { + IslQuotaScope MaxScope = OAI->MaxOpGuard.enter(); + // FIXME: Prevectorization requirements are different from those checked by // isTileableBandNode. 
Node = applyPrevectBandOpt(Node); + + if (OAI->MaxOpGuard.hasQuotaExceeded() || Node.is_null()) + return (isl::schedule_node()).release(); } return Node.release(); @@ -771,6 +790,10 @@ static void runIslScheduleOptimizer( return; } + isl_ctx *Ctx = S.getIslCtx().get(); + IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut, + /*AutoEnter=*/false); + // Apply ISL's algorithm only if not overridden by the user. Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that @@ -853,8 +876,6 @@ static void runIslScheduleOptimizer( IslOuterCoincidence = 0; } - isl_ctx *Ctx = S.getIslCtx().get(); - isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); @@ -870,28 +891,20 @@ static void runIslScheduleOptimizer( SC = SC.set_coincidence(Validity); { - IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); + IslQuotaScope MaxOpScope = MaxOpGuard.enter(); Schedule = SC.compute_schedule(); - - if (MaxOpGuard.hasQuotaExceeded()) - POLLY_DEBUG( - dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); } isl_options_set_on_error(Ctx, OnErrorStatus); - ScopsRescheduled++; + if (!Schedule.is_null()) + ScopsRescheduled++; POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); } walkScheduleTreeForStatistics(Schedule, 1); - // In cases the scheduler is not able to optimize the code, we just do not - // touch the schedule. - if (Schedule.is_null()) - return; - - if (GreedyFusion) { + if (GreedyFusion && !Schedule.is_null()) { isl::union_map Validity = D.getDependences( Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); Schedule = applyGreedyFusion(Schedule, Validity); @@ -905,14 +918,20 @@ static void runIslScheduleOptimizer( /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, /*Postopts=*/!HasUserTransformation && EnablePostopts, /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, - DepsChanged}; - if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { + DepsChanged, + MaxOpGuard}; + if (!Schedule.is_null() && (OAI.PatternOpts || OAI.Postopts || OAI.Prevect)) { Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); Schedule = hoistExtensionNodes(Schedule); POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); walkScheduleTreeForStatistics(Schedule, 2); } + if (MaxOpGuard.hasQuotaExceeded()) { + POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); + return; + } + // Skip profitability check if user transformation(s) have been applied. if (!HasUserTransformation && !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index 3f3630027e6e3..c95c55858f038 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -972,6 +972,9 @@ BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { } isl::schedule polly::hoistExtensionNodes(isl::schedule Sched) { + if (Sched.is_null()) + return {}; + // If there is no extension node in the first place, return the original // schedule tree. 
if (!containsExtensionNode(Sched)) @@ -1126,6 +1129,8 @@ isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, isl::union_set polly::getIsolateOptions(isl::set IsolateDomain, unsigned OutDimsNum) { + if (IsolateDomain.is_null()) + return {}; unsigned Dims = unsignedFromIslSize(IsolateDomain.tuple_dim()); assert(OutDimsNum <= Dims && "The isl::set IsolateDomain is used to describe the range of schedule " diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll new file mode 100644 index 0000000000000..0bc3c2cf642e8 --- /dev/null +++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll @@ -0,0 +1,37 @@ +; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) { +bb: + %getelementptr = getelementptr [7 x float], ptr null, i32 0, i32 %arg3 + br label %bb7 + +bb7: ; preds = %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %add16, %bb11 ] + br label %bb8 + +bb8: ; preds = %bb8, %bb7 + %phi9 = phi i32 [ 0, %bb7 ], [ %add, %bb8 ] + %getelementptr10 = getelementptr [7 x float], ptr null, i32 0, i32 %phi9 + store float 0.000000e+00, ptr %getelementptr10, align 4 + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, 0 + br i1 %icmp, label %bb8, label %bb11 + +bb11: ; preds = %bb8 + %load = load float, ptr %getelementptr, align 4 + store float %load, ptr %arg4, align 4 + %getelementptr12 = getelementptr [7 x float], ptr null, i32 0, i32 %arg5 + %load13 = load float, ptr %getelementptr12, align 4 + store float %load13, ptr %arg, align 4 + %getelementptr14 = getelementptr [7 x float], ptr null, i32 0, i32 %arg6 + %load15 = load float, ptr %getelementptr14, align 4 + store float %load15, ptr %arg1, align 4 + %add16 = add i32 %phi, 1 + %icmp17 = icmp ne i32 %phi, %arg2 + br i1 %icmp17, label %bb7, label %bb18 + +bb18: ; preds = %bb11 + ret void +} +; CHECK:Schedule optimizer calculation exceeds ISL quota From 3d3fab17f5ea8a14eb390f53075c094f5e1f19fa Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 3 Nov 2025 07:37:13 -0600 Subject: [PATCH 007/313] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (#165865) The ALLOCATE directive has two forms: - A declarative form with a standalone directive: ``` !$OMP ALLOCATE (variable-list-item...) ``` - An executable form that consists of several directives followed by an ALLOCATE statement: ``` !$OMP ALLOCATE (variable-list-item...) !$OMP ALLOCATE (variable-list-item...) ... ALLOCATE (...) ``` The second form was deprecated in OpenMP 5.2 in favor of the ALLOCATORS construct. Since in the parse tree every type corresponding to a directive only corresponds to a single directive, the executable form is represented by a sequence of nested OmpAllocateDirectives, e.g. ``` !$OMP ALLOCATE(x) !$OMP ALLOCATE(y) ALLOCATE(x, y) ``` will become ``` OmpAllocateDirective |- ALLOCATE(x) // begin directive `- OmpAllocateDirective // block |- ALLOCATE(y) // begin directive `- ALLOCATE(x, y) // block ``` With this change all AST nodes for directives use OmpDirectiveSpecification as the directive representation. 
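A sketch of how a consumer can recover the flat view from the nested representation, via the `SplitOmpAllocate` helper added in openmp-utils.h (`handleDirective` and `handleAllocateStmt` are illustrative placeholders, not part of this change):

```cpp
// Flattens the nested executable form back into "each !$OMP ALLOCATE
// directive plus the associated ALLOCATE statement".
parser::omp::OmpAllocateInfo Info = parser::omp::SplitOmpAllocate(Outer);
for (const parser::OmpAllocateDirective *Dir : Info.dirs)
  handleDirective(Dir->BeginDir()); // one entry per nested directive
if (Info.body) // stays null for the declarative form
  handleAllocateStmt(*Info.body); // the trailing ALLOCATE(x, y) statement
```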
--- flang/examples/FeatureList/FeatureList.cpp | 3 +- flang/include/flang/Parser/dump-parse-tree.h | 3 +- flang/include/flang/Parser/openmp-utils.h | 31 +- flang/include/flang/Parser/parse-tree.h | 56 ++-- flang/lib/Lower/OpenMP/OpenMP.cpp | 18 +- flang/lib/Parser/openmp-parsers.cpp | 46 +-- flang/lib/Parser/openmp-utils.cpp | 39 +++ flang/lib/Parser/unparse.cpp | 26 +- flang/lib/Semantics/canonicalize-omp.cpp | 156 ++++++++-- flang/lib/Semantics/check-omp-structure.cpp | 291 +++++++++--------- flang/lib/Semantics/check-omp-structure.h | 27 +- flang/lib/Semantics/resolve-directives.cpp | 58 ++-- flang/lib/Semantics/resolve-names.cpp | 4 +- .../Todo/omp-declarative-allocate-align.f90 | 2 +- .../OpenMP/Todo/omp-declarative-allocate.f90 | 2 +- .../Parser/OpenMP/allocate-align-tree.f90 | 48 +-- .../Parser/OpenMP/allocate-tree-spec-part.f90 | 63 ++-- flang/test/Parser/OpenMP/allocate-tree.f90 | 80 ++--- flang/test/Parser/OpenMP/allocate-unparse.f90 | 18 +- .../Semantics/OpenMP/allocate-align01.f90 | 2 +- .../Semantics/OpenMP/allocate-directive.f90 | 2 +- flang/test/Semantics/OpenMP/allocate01.f90 | 2 +- flang/test/Semantics/OpenMP/allocate02.f90 | 1 + flang/test/Semantics/OpenMP/allocate03.f90 | 1 + flang/test/Semantics/OpenMP/allocate06.f90 | 2 +- flang/test/Semantics/OpenMP/allocate10.f90 | 2 +- flang/test/Semantics/OpenMP/allocate12.f90 | 16 + 27 files changed, 587 insertions(+), 412 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/allocate12.f90 diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..ef58da61e371b 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -445,6 +445,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +542,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +550,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..de2716410d6cd 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -512,6 +512,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -739,7 +740,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +748,6 @@ class ParseTreeDumper { 
NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include #include #include +#include namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template constexpr auto addr_if(const std::optional &x) { } namespace detail { -using D = llvm::omp::Directive; - -template // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait) { if constexpr (std::is_base_of_v) { return std::get(x.t).DirName(); - } else if constexpr (std::is_same_v || - std::is_same_v) { - return MakeName(std::get(x.t).source, ConstructId::id); } else { return GetFromTuple( x.t, std::make_index_sequence>{}); @@ -139,6 +120,9 @@ template OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 4dd5e84f60dfe..8c7578f7a1941 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5151,17 +5151,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. 
Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAlocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty. +struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple, OmpClauseList, - std::optional>, - Statement> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5354,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..ad456d89bc432 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3503,12 +3503,12 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3899,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index a9de26ea09ff8..4374acbbe51bf 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1778,6 +1778,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional 
Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return construct(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,14 +2069,6 @@ TYPE_PARSER(construct(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER(sourced(construct( - verbatim("ALLOCATE"_tok), maybe(parenthesized(Parser{})), - Parser{}, - maybe(nonemptyList(startOmpLine >> Parser{})) / - endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct( predicated(Parser{}, @@ -2077,13 +2094,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser{})), - Parser{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct( predicated(OmpDirectiveNameParser{}, @@ -2106,7 +2116,7 @@ TYPE_PARSER( construct( Parser{}) || construct( - Parser{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct( Parser{}) || construct( @@ -2194,6 +2204,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct(Parser{}), construct(Parser{}), + construct( + sourced(OmpExecutableAllocateParser{})), construct(Parser{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2201,9 +2213,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct(Parser{}), construct(Parser{}), construct(Parser{}), - construct(Parser{}), construct(Parser{}), - construct(Parser{}), construct(Parser{}), construct(Parser{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if(&x.u)) { + if (auto *z{std::get_if>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if(&x.u)}) { + if (auto *z{std::get_if>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + 
OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9255c4e1136bc..84123030195e9 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2482,30 +2482,8 @@ class UnparseVisitor { Unparse(static_cast(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get>(x.t), ")"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get>(x.t), ")"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast(x)); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. 
+ // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if(&epc.u)) { + if (auto *as = + std::get_if>(&ec->u)) { + return std::holds_alternative< + common::Indirection>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list, or + // (2) in std::list. + // The case (1) is only possible if the list (2) is empty. + + auto &omps = + std::get>(spec.t); + auto &decls = std::get>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if(&dc->u)) { + if (auto *odc = std::get_if(&sc->u)) { + if (auto *alo = + std::get_if(&odc->value().u)) { + return alo; + } + } + } + return static_cast(nullptr); + }; + + std::list::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. 
+ parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3ea8e5b8cd2b0..e7e3f4d886b34 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,6 +179,22 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} + +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} + +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} + +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} + // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
#define CHECK_SIMPLE_CLAUSE(X, Y) \ void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ @@ -720,18 +736,10 @@ template struct DirectiveSpellingVisitor { return std::get(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1667,11 +1675,6 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -static std::pair -getAllocateStmtAndSource(const parser::Statement &stmt) { - return {&stmt.statement, stmt.source}; -} - static std::pair getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { if (SourcedActionStmt as{GetActionStmt(epc)}) { @@ -1699,19 +1702,12 @@ static UnorderedSymbolSet GetNonComponentSymbols( return symbols; } -static const parser::OmpObjectList &GetObjectsOrEmpty( - const std::optional &maybeObjects) { - static parser::OmpObjectList empty{std::list{}}; - if (maybeObjects) { - return *maybeObjects; - } - return empty; -} +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; + const Scope &thisScope{context_.FindScope(dirName.source)}; auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1740,7 +1736,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, const auto *allocator{[&]() { // Can't use FindClause in Enter (because clauses haven't been visited // yet). - for (const parser::OmpClause &c : clauses.v) { + for (const parser::OmpClause &c : beginSpec.Clauses().v) { if (c.Id() == llvm::omp::Clause::OMPC_allocator) { return &c; } @@ -1752,7 +1748,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } @@ -1766,7 +1762,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, : "a named common block or has SAVE attribute"}; auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { - if (!inExecutableAllocate_) { + if (!isExecutable) { // For structure members, the scope is the derived type, which is // never "this" scope. Ignore this check for members, they will be // flagged anyway. 
@@ -1802,37 +1798,130 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, } }}; - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return source; - }()}; - if (const Symbol *symbol{GetObjectSymbol(object)}) { + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { if (!IsTypeParamInquiry(*symbol)) { - checkSymbol(*symbol, objSource); + checkSymbol(*symbol, arg.source); + } + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); + } + } +} + +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. + return; + } + + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. 
They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere, + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } + + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } } - CheckVarIsNotPartOfAnotherVar(source, object); } } } -void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; - if (!inExecutableAllocate_) { - const auto &dir{std::get(x.t)}; - const auto &clauses{std::get(x.t)}; - const auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } - CheckAllocateDirective(dir.source, objects, clauses); + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get(x.t)}; + // The parser should put at most one statement in the body. 
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } @@ -2135,112 +2224,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - inExecutableAllocate_ = true; - const auto &dir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; - auto &clauses{std::get(x.t)}; - - CheckAllocateDirective( - std::get(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get(x.t)}; - const auto &clauses{std::get(dalloc.t)}; - const auto &objects{GetObjectsOrEmpty( - std::get>(dalloc.t))}; - CheckAllocateDirective(dir.source, objects, clauses); - } - } -} - -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - auto [allocStmt, allocSource]{getAllocateStmtAndSource( - std::get>(x.t))}; - - UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; - SymbolSourceMap directiveSyms; - auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; - auto emptyListCount{static_cast(objects.v.empty())}; - auto checkObjects{[&](const parser::OmpObjectList &objects, - parser::CharBlock dirSource, - parser::CharBlock allocSource) { - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return dirSource; - }()}; - if (auto *sym{GetObjectSymbol(object)}) { - // Ignore these checks for structure members. 
They are not allowed - // in the first place, so don't tell the users that they nened to - // be specified somewhere, - if (IsStructureComponent(*sym)) { - continue; - } - if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { - parser::MessageFormattedText txt( - "A list item on an executable ALLOCATE may only be specified once"_err_en_US); - parser::Message message(objSource, txt); - message.Attach(f->second, "The list item was specified here"_en_US); - context_.Say(std::move(message)); - } else { - directiveSyms.insert(std::make_pair(sym, objSource)); - } - - if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { - context_ - .Say(objSource, - "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) - .Attach(allocSource, "The ALLOCATE statement"_en_US); - } - } - } - }}; - - checkObjects(objects, std::get(x.t).source, allocSource); - - const auto &subDirs{ - std::get>>( - x.t)}; - if (!subDirs) { - inExecutableAllocate_ = false; - dirContext_.pop_back(); - return; - } - - for (const parser::OpenMPDeclarativeAllocate &ompAlloc : *subDirs) { - parser::CharBlock dirSource{std::get(ompAlloc.t).source}; - auto &objects{GetObjectsOrEmpty( - std::get>(ompAlloc.t))}; - if (objects.v.empty()) { - // Only show the message once per construct. - if (++emptyListCount == 2 && subDirs->size() >= 1) { - context_.Say(dirSource, - "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); - } - } - checkObjects(objects, dirSource, allocSource); - } - - inExecutableAllocate_ = false; - dirContext_.pop_back(); -} - void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 6feb1d149c4fd..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const 
parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -373,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase { }; int directiveNest_[LastType + 1] = {0}; - bool inExecutableAllocate_{false}; + int allocateDirectiveLevel{0}; parser::CharBlock visitedAtomicSource_; SymbolSourceMap deferredNonVariables_; @@ -382,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase { std::vector loopStack_; // Scopes for scoping units. std::vector scopeStack_; + + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector partStack_; }; /// Find a duplicate entry in the range, and return an iterator to it. diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 03c8cb0065fd8..deb57e005a352 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return true; } + bool Pre(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); + return true; + } + void Post(const parser::SpecificationPart &) { partStack_.pop_back(); } + + bool Pre(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); + return true; + } + void Post(const parser::ExecutionPart &) { partStack_.pop_back(); } + bool Pre(const parser::InternalSubprogram &) { // Clear the labels being tracked in the previous scope ClearLabels(); @@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &); - void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); } + bool Pre(const parser::OmpAllocateDirective &); bool Pre(const parser::OpenMPAssumeConstruct &); void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); } @@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPDispatchConstruct &); void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); } - bool Pre(const parser::OpenMPExecutableAllocate &); - void Post(const parser::OpenMPExecutableAllocate &); - bool Pre(const parser::OpenMPAllocatorsConstruct &); void Post(const parser::OpenMPAllocatorsConstruct &); @@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { targetLabels_; parser::CharBlock currentStatementSource_; + enum class PartKind : int { 
+ // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector partStack_; + void AddAllocateName(const parser::Name *&object) { allocateNames_.push_back(object); } @@ -2558,11 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - if (const auto &list{std::get>(x.t)}) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpDeclarativeAllocateDirective); + assert(!partStack_.empty() && "Misplaced directive"); + + auto ompFlag{partStack_.back() == PartKind::SpecificationPart + ? Symbol::Flag::OmpDeclarativeAllocateDirective + : Symbol::Flag::OmpExecutableAllocateDirective}; + + parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)}; + for (const parser::OmpAllocateDirective *ad : info.dirs) { + for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) { + if (auto *object{omp::GetArgumentObject(arg)}) { + ResolveOmpObject(*object, ompFlag); + } + } } + + PopContext(); return false; } @@ -2581,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - const auto &list{std::get>(x.t)}; - if (list) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective); - } - return true; -} - bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; PushContext(x.source, dirSpec.DirId()); @@ -2661,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { return false; } -void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - PopContext(); -} - void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { PopContext(); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 220f1c96b9823..a2062ef28d52c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! 
CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | 
| OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP ALLOCATE (j) ALIGN(16_4) -!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective 
+!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> 
int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: 
!$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 63cc35cd55082..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ -34,19 +34,19 @@ program allocate_unparse end program allocate_unparse !CHECK:!$OMP ALLOCATE{{[ ]*$}} -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) !CHECK:!$OMP ALLOCATE{{[ ]*$}} !CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) diff --git 
a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 index a9db7330296ba..0a9e85b8ae2fe 100644 --- a/flang/test/Semantics/OpenMP/allocate10.f90 +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -4,8 +4,8 @@ subroutine f00 integer, allocatable :: x, y continue - !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate allocate(x, y) end diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end From 8395343811ecddb14cfd0ebf273d9d588bcaa8e7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 3 Nov 2025 13:44:54 +0000 Subject: [PATCH 008/313] [X86] combineTruncate - trunc(srl(load(p),amt)) -> load(p+amt/8) - ensure we merge the full / truncated load chains (#166160) The full load might persist so ensure that the chains are merged into a token factor instead of just transferring the chain to the new load Noticed while trying to fix the regression reported from #165540 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406bd8e0d7..e5b2743f602da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54529,8 +54529,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
         SDValue NewLoad =
             DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
                         Align(), Ld->getMemOperand()->getFlags());
-        DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
-                                      NewLoad.getValue(1));
+        DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
         return NewLoad;
       }
     }

From 5256db32bb14a358a23251f714cc39d4d761fef1 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Mon, 3 Nov 2025 06:01:46 -0800
Subject: [PATCH 009/313] [CI] Use action from FAILED: in ninja log parser

There are cases where the progress indicator does not align at all with
the action printed in FAILED:. Default to just using the action there so
that we can ensure the results are accurate. This still leaves some
issues where we are not capturing all the log lines, but I'll look at
those in a future patch. Those are also less critical since they do not
cause the script to crash.

Partially fixes #165131.

Reviewers: DavidSpickett

Reviewed By: DavidSpickett

Pull Request: https://github.com/llvm/llvm-project/pull/166100
---
 .ci/generate_test_report_lib.py      | 10 ++++---
 .ci/generate_test_report_lib_test.py | 42 +++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 36c95852452ac..7820fbda803d7 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -41,10 +41,12 @@ def _parse_ninja_log(ninja_log: list[str]) -> list[tuple[str, str]]:
     # touch test/4.stamp
     #
     # index will point to the line that starts with Failed:. The progress
-    # indicator is the line before this ([4/5] test/4.stamp) and contains a pretty
-    # printed version of the target being built (test/4.stamp). We use this line
-    # and remove the progress information to get a succinct name for the target.
+    # indicator is sometimes the line before this ([4/5] test/4.stamp) and
+    # will contain a pretty printed version of the target being built
+    # (test/4.stamp) when accurate. We instead parse the failed line rather
+    # than the progress indicator as the progress indicator may not be
+    # aligned with the failure.
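    # [Editorial aside, not part of the patch] A minimal sketch of the
    # extraction on the next line, assuming a log line of the form
    # "FAILED: <action>":
    #   >>> "FAILED: tools/check-langley".split("FAILED: ")[1]
    #   'tools/check-langley'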
+ failing_action = ninja_log[index].split("FAILED: ")[1] failure_log = [] while ( index < len(ninja_log) diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index 431e10da6405a..4068a3b7300a4 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -448,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
- test/2.stamp + touch test/2.stamp ``` FAILED: touch test/2.stamp @@ -456,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ```
- test/4.stamp
+ touch test/4.stamp
 ```
 FAILED: touch test/4.stamp

From bf2f5773d9d50d74a4cdeac4d88762e2d9776175 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
Date: Mon, 3 Nov 2025 15:22:21 +0100
Subject: [PATCH 010/313] [Clang] Make the AS of llvm.compiler.used & llvm.used
 elements addrspace(0) (#164432)

By convention the AS of the elements of `llvm.compiler.used` &
`llvm.used` is 0. However, the AS of `CGM.Int8PtrTy` is not always 0.
This leaves some LLVM helpers
(`appendToUsed/appendToCompilerUsed/removeFromUsedLists`) unusable.

This patch makes the AS of the elements of these variables 0.

This PR is related to https://github.com/llvm/llvm-project/pull/162660
---
 clang/lib/CodeGen/CodeGenModule.cpp           | 12 ++--
 .../embed-bitcode-marker-with-nonzero-as.c    |  2 +-
 .../llvm_compiler_used_elements_are_unqual.c  | 64 +++++++++++++++++++
 3 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 0fea57b2e1799..af5be95aec1cd 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3331,18 +3331,18 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name,
   if (List.empty())
     return;

+  llvm::PointerType *UnqualPtr =
+      llvm::PointerType::getUnqual(CGM.getLLVMContext());
+
   // Convert List to what ConstantArray needs.
   SmallVector<llvm::Constant *, 8> UsedArray;
   UsedArray.resize(List.size());
   for (unsigned i = 0, e = List.size(); i != e; ++i) {
-    UsedArray[i] =
-        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-            cast<llvm::Constant>(&*List[i]), CGM.Int8PtrTy);
+    UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+        cast<llvm::Constant>(&*List[i]), UnqualPtr);
   }

-  if (UsedArray.empty())
-    return;
-  llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size());
+  llvm::ArrayType *ATy = llvm::ArrayType::get(UnqualPtr, UsedArray.size());

   auto *GV = new llvm::GlobalVariable(
       CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage,

diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
index df7118859c764..8af9708a1bfb8 100644
--- a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
+++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
@@ -3,6 +3,6 @@

 // CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1
 // CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1
-// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata"
+// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo.managed to ptr), ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr)], section
"llvm.metadata" __attribute__((managed)) int foo = 42; diff --git a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c new file mode 100644 index 0000000000000..b6550fb1e5c77 --- /dev/null +++ b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -x c -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x c -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x c -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV +// RUN: %clang_cc1 -x c -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_CL +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_CL +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL + +#ifndef __OPENCL_C_VERSION__ +#define __constant const +#endif + +static __constant __attribute__((__used__)) int foo = 42; + + +// X86: @foo = internal constant i32 42 +// X86: @llvm.compiler.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" +// +// AMDGCN: @foo = internal addrspace(4) constant i32 42 +// AMDGCN: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @foo to ptr), ptr @bar], section "llvm.metadata" +// +// SPIRV: @foo = internal constant i32 42 +// SPIRV: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" +// +// SPIRV_CL: @foo = internal addrspace(2) constant i32 42 +// SPIRV_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr @bar], section "llvm.metadata" +// +// SPIRV_AMD: @foo = internal addrspace(1) constant i32 42 +// SPIRV_AMD: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" +// +// SPIRV_AMD_CL: @foo = internal addrspace(2) constant i32 42 +// SPIRV_AMD_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" +// +// X86: define internal void @bar() #{{[0-9]}} { +// +// AMDGCN: define internal void @bar() #{{[0-9]}} { +// +// SPIRV: define internal spir_func void @bar() #{{[0-9]}} { +// +// SPIRV_CL: define internal spir_func void @bar() #{{[0-9]}} { +// +// 
SPIRV_AMD: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} {
+//
+// SPIRV_AMD_CL: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} {
+//
+static void __attribute__((__used__)) bar() {
+}

From 9c26170dd70865d253727e5122ce2a892d55800e Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Mon, 3 Nov 2025 08:21:32 -0600
Subject: [PATCH 011/313] [libc] Fix Linux kernel headers being included on all OSes

Summary:
The changes in
https://github.com/llvm/llvm-project/commit/43bd7e3bb903af5076a9552f4f64cfc5d58f76ce
altered how we handled including headers; this included the system
headers on the GPU target, which poisoned the include path that was
curated to not include any system headers. Change this to only apply if
the target OS is Linux.
---
 libc/CMakeLists.txt                     | 7 +++++--
 runtimes/cmake/Modules/HandleLibC.cmake | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index ae555a256ba66..4e6b4195a9c5e 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL
     "Whether or not to place libc in a build directory findable by a just built clang")

-set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers")
-
 # Defining a global namespace to enclose all libc functions.
 set(default_namespace "__llvm_libc")
 if(LLVM_VERSION_MAJOR)
@@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless
 option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)

+if(LIBC_TARGET_OS_IS_LINUX)
+  set(kernel_headers "/usr/include")
+endif()
+set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers")
+
 set(LIBC_ENABLE_UNITTESTS ON)
 set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD})

diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake
index 01da5b260d3d4..f8869512f99d3 100644
--- a/runtimes/cmake/Modules/HandleLibC.cmake
+++ b/runtimes/cmake/Modules/HandleLibC.cmake
@@ -30,7 +30,9 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc")
   check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG)
   if(CXX_SUPPORTS_NOSTDLIBINC_FLAG)
     target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc")
-    target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
+    if(LIBC_KERNEL_HEADERS)
+      target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
+    endif()
   endif()

 add_library(runtimes-libc-static INTERFACE)

From 60e53d2cc2740c74f0a64922c6868a63c688c1c5 Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Mon, 3 Nov 2025 09:36:12 -0500
Subject: [PATCH 012/313] [PowerPC] Implement 32-byte indexed paired ld and st instruction (#160767)

---
 llvm/lib/Target/PowerPC/PPCInstrFuture.td              | 8 ++++++++
 .../MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt | 6 ++++++
 .../Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt | 6 ++++++
 llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s          | 8 ++++++++
 4 files changed, 28 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index da3efdc15f1e1..0c2e44e18f463 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in {
     def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp),
                                   (ins (memr $RA):$addr, g8rc:$RB),
"lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index f5cb4b72959f9..2661ed5b04cc9 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -82,12 +82,18 @@ #CHECK: lxvprll 6, 2, 1 0x7c 0xc2 0x0c 0xda +#CHECK: lxvpb32x 2, 15, 16 +0x7c,0x4f,0x86,0xda + #CHECK: stxvprl 0, 1, 2 0x7c 0x01 0x15 0x9a #CHECK: stxvprll 6, 0, 1 0x7c 0xc0 0x0d 0xda +#CHECK: stxvpb32x 2, 15, 16 +0x7c,0x4f,0x87,0xda + #CHECK: dmxvi8gerx4 1, 2, 4 0xec,0x82,0x20,0x58 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index f0df8ce39021b..7fb8254ced0ac 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -76,12 +76,18 @@ #CHECK: lxvprll 6, 2, 1 0xda 0x0c 0xc2 0x7c +#CHECK: lxvpb32x 2, 15, 16 +0xda,0x86,0x4f,0x7c + #CHECK: stxvprl 0, 1, 2 0x9a 0x15 0x01 0x7c #CHECK: stxvprll 6, 0, 1 0xda 0x0d 0xc0 0x7c +#CHECK: stxvpb32x 2, 15, 16 +0xda,0x87,0x4f,0x7c + #CHECK: dmxvi8gerx4 1, 2, 4 0x58,0x20,0x82,0xec diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..40059c440b128 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] From 8dcd02accfb2d0217372716edb2a4f325ddb7442 Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Mon, 3 Nov 2025 16:37:22 +0200 Subject: [PATCH 013/313] [Clang][Bytecode] Match exact void pointer deref error message (#166133) Better match exactly rather than just a prefix. 
--- clang/test/AST/ByteCode/cxx11.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 427d3a106656b..e283a7b42e554 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -374,7 +374,7 @@ namespace GH150709 { namespace DiscardedAddrLabel { void foo(void) { L: - *&&L; // both-error {{indirection not permitted}} \ + *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \ // both-warning {{expression result unused}} } } From 7c9f137b3ce775a5a4f9eee54700e1591a4a9f9d Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 3 Nov 2025 09:56:34 -0500 Subject: [PATCH 014/313] [ADT] Drop unused include in StringSwitch. NFC. --- llvm/include/llvm/ADT/StringSwitch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 8c8d31bd4f055..5bdbb302a6d75 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -64,7 +63,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; From 1c094a1ce2ef15b5855b11aa85dbb2f1eea54f13 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Mon, 3 Nov 2025 14:59:49 +0000 Subject: [PATCH 015/313] [TSan][Test-Only] Account for race in cxa_guard_acquire.cpp test (#165853) It is possible for "Enter potentially blocking region" to come before "Enter constructor" in this test - if the thread that acquires the guard fails to reach its printf before the other thread that enters the blocking region reaches its own printf. Note that for the exit logs this inversion is not possible. This patch addresses this by allowing those two log lines to come in either order. rdar://163375661 --- compiler-rt/test/tsan/cxa_guard_acquire.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, then Thread 2 fail to acquire + // the guard then call `OnPotentiallyBlockingRegionBegin` and print "Enter + // potentially blocking region\n", before Thread 1 manages to reach "Enter + // constructor\n". This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. 
// - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); From 332f9b5eeef85dca29112018ba111bf64a75d27d Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Mon, 3 Nov 2025 16:09:12 +0100 Subject: [PATCH 016/313] [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm (#152161) Finishes adding inline-asm callbr support for AMDGPU, started by https://github.com/llvm/llvm-project/pull/149308. --- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 89 +++--- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 19 +- llvm/test/CodeGen/AMDGPU/callbr.ll | 54 ++++ ...nify-divergent-exit-nodes-with-musttail.ll | 51 ++++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 257 ++++++++++++++++-- .../si-annotate-nested-control-flows.ll | 100 ++++++- .../si-unify-exit-multiple-unreachables.ll | 161 ++++++++++- llvm/test/CodeGen/AMDGPU/update-phi.ll | 39 +++ llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ++++++++++++++++ 9 files changed, 926 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll create mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..706237b906cc3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector &Updates) { + SmallVector Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+  BB->getTerminator()->eraseFromParent();
+  BranchInst::Create(TransitionBB, DummyReturnBB,
+                     ConstantInt::getTrue(F.getContext()), BB);
+  Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
+
 bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                             const PostDominatorTree &PDT,
                                             const UniformityInfo &UA) {
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   if (PDT.root_size() == 0 ||
       (PDT.root_size() == 1 &&
-       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+       !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
     return false;
 
   // Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
       if (HasDivergentExitBlock)
         UnreachableBlocks.push_back(BB);
     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
-      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
-      if (DummyReturnBB == nullptr) {
-        DummyReturnBB = BasicBlock::Create(F.getContext(),
-                                           "DummyReturnBlock", &F);
-        Type *RetTy = F.getReturnType();
-        Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
-        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
-        ReturningBlocks.push_back(DummyReturnBB);
-      }
+      if (!DummyReturnBB)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
 
       if (BI->isUnconditional()) {
         BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
         BI->eraseFromParent(); // Delete the unconditional branch.
         // Add a new conditional branch with a dummy edge to the return block.
-        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
-        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-      } else { // Conditional branch.
-        SmallVector<BasicBlock *> Successors(successors(BB));
-
-        // Create a new transition block to hold the conditional branch.
-        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
-        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
-        // 'Successors' become successors of TransitionBB instead of BB,
-        // and TransitionBB becomes a single successor of BB.
-        Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
-        for (BasicBlock *Successor : Successors) {
-          Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-          Updates.emplace_back(DominatorTree::Delete, BB, Successor);
-        }
-
-        // Create a branch that will always branch to the transition block and
-        // references DummyReturnBB.
-        BB->getTerminator()->eraseFromParent();
-        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+        BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+                           ConstantInt::getTrue(F.getContext()), BB);
         Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+      } else {
+        handleNBranch(F, BB, BI, DummyReturnBB, Updates);
       }
 
       Changed = true;
+    } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      if (!DummyReturnBB)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+      handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+    } else {
+      llvm_unreachable("unsupported block terminator");
     }
   }
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5f6f66a4bc213..0a8f5ea2fdae1 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
   } else {
     // Test for successors as back edge
     BasicBlock *BB = N->getNodeAs<BasicBlock>();
-    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
-    for (BasicBlock *Succ : Term->successors())
-      if (Visited.count(Succ))
-        Loops[Succ] = BB;
+    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
+      for (BasicBlock *Succ : Term->successors())
+        if (Visited.count(Succ))
+          Loops[Succ] = BB;
   }
 }
 
@@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 
   for (BasicBlock *P : predecessors(BB)) {
     // Ignore it if it's a branch from outside into our region entry
-    if (!ParentRegion->contains(P))
+    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
       continue;
 
     Region *R = RI->getRegionFor(P);
@@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
 /// Run the transformation for each region found
 bool StructurizeCFG::run(Region *R, DominatorTree *DT,
                          const TargetTransformInfo *TTI) {
-  if (R->isTopLevelRegion())
+  // CallBr and its corresponding direct target blocks are for now ignored by
+  // this pass. This is not a limitation for the currently intended use cases
+  // of callbr in the AMDGPU backend.
+  // Parent and child regions are not affected by this (current) restriction.
+  // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
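+  // (Illustration only: a region whose entry block ends in
+  //   callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+  // is left untouched by this early-out; its parent and child regions are
+  // still structurized as usual.)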
+ if (R->isTopLevelRegion() || isa(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + 
callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: 
infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; 
%entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 
[[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll 
b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) 
[[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. 
For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... +; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. 
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} From 4f618636ddb4938bb91816b66250edc755cdc7d1 Mon Sep 17 00:00:00 2001 From: SKill Date: Mon, 3 Nov 2025 16:12:15 +0100 Subject: [PATCH 017/313] [clang] Optimize SourceManager.getSpellingLocSlowCase and SourceManager.getFileLocSlowCase (#164269) Optimize implementations of `getSpellingLocSlowCase` and `getFileLocSlowCase` by inlining called methods to avoid repeated calls to `getSLocEntry` and `getFileID`. a performance improvement follow up for #160667 --- clang/lib/Basic/SourceManager.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 938c6485125ee..97aa0f2aa59b9 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -907,19 +907,23 @@ getExpansionLocSlowCase(SourceLocation Loc) const { SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const { do { - FileIDAndOffset LocInfo = getDecomposedLoc(Loc); - Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(LocInfo.second); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset( + Loc.getOffset() - Entry.getOffset()); } while (!Loc.isFileID()); return Loc; } SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { do { - if (isMacroArgExpansion(Loc)) - Loc = getImmediateSpellingLoc(Loc); - else - Loc = getImmediateExpansionRange(Loc).getBegin(); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + const ExpansionInfo &ExpInfo = Entry.getExpansion(); + if (ExpInfo.isMacroArgExpansion()) { + Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() - + Entry.getOffset()); + } else { + Loc = ExpInfo.getExpansionLocStart(); + } } while (!Loc.isFileID()); return Loc; } From 5a11eea58193ed30f1a5a126d33d2627026c1451 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 3 Nov 2025 19:18:41 +0400 Subject: [PATCH 018/313] [clang] Add sections to C++ DR status page (#165749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following [recent discussion](http://lists.isocpp.org/core/2025/10/18637.php) on CWG reflector, `cwg_index.html` now contains stable name of the section for each Core issue. I thought that this would be a useful information to have on our C++ DR status page for that handful of experts who actually open it, so here we are. This PR consists of 3 parts: 1. 
Rewrite of a small routine that parses `cwg_index.html` from splits and
indices to a single regular expression with named groups, adding section
information.
2. Changes to the rest of the `make_cxx_dr_status` script to accommodate the
first part.
3. Regenerated `cxx_dr_status.html`.

Ideally this PR would only add lines to `cxx_dr_status.html`, but previously
we've been leaving some newlines in issue titles (which never affected how
this page is rendered), which are now properly replaced with whitespace —
hence a couple of deletions in that file.

---------

Co-authored-by: Richard Smith
---
 clang/www/cxx_dr_status.html | 3077 +++++++++++++++++++++++++++++++++-
 clang/www/make_cxx_dr_status |   72 +-
 2 files changed, 3109 insertions(+), 40 deletions(-)

diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 0312c9dfc0665..e9fadb2dbd4ac 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -40,2843 +40,3314 @@

 <h1>C++ defect report implementation status</h1>
 
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
 [The remainder of this hunk -- the regenerated issue table, roughly 3,000
 rows in which every issue gains a new Section column -- survived extraction
 only as bare "+"/"-" markers and is elided.]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 485a9a56267ca..3ba12e13a7354 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
 dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
 
 class DR:
-  def __init__(self, section, issue, url, status, title):
-    self.section, self.issue, self.url, self.status, self.title = \
-      section, issue, url, status, title
+  def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title):
+    self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \
+      section_number, section_name, section_link, number, url, status, liaison, title
 
   def __repr__(self):
-    return '%s (%s): %s' % (self.issue, self.status, self.title)
-
-def parse(dr):
-  try:
-    section, issue_link, status, liaison, title = [
-      col.split('>', 1)[1].split('</TD>')[0]
-      for col in dr.split('</TR>', 1)[0].split('<TD', 1)[1].split('<', 1)[0])
-    title = title.replace('<issue_title>', '').replace('</issue_title>', '').replace('\r\n', '\n').strip()
-    return DR(section, issue, url, status, title)
+    return '%s (%s): %s' % (self.number, self.status, self.title)
+
+  pattern = re.compile('''
+<TD ALIGN=RIGHT>(?P<section_number>.*) <A HREF="(?P<section_link>.*)">(?P<section_name>.*)</A></TD>
+<TD ALIGN=RIGHT><A HREF="(?P<url>.*)">(?P<number>.*)</A></TD>
+<TD>(?P<status>.*)</TD>
+<TD>(?P<liaison>.*)</TD>
+<TD><issue_title>(?P<title>[\\w\\W]*)</issue_title></TD>
+</TR>''')
+
+  @classmethod
+  def parse_from_html(cls, html_string):
+    match = cls.pattern.match(html_string)
+    if match is None:
+      print(f"Parse error: {html_string}", file=sys.stderr)
+      exit(1)
+    return cls(
+      section_number=match.group('section_number'),
+      section_name=match.group('section_name'),
+      section_link=match.group('section_link'),
+      number=int(match.group('number')),
+      url=match.group('url'),
+      status=match.group('status'),
+      liaison=match.group('liaison'),
+      title=match.group('title').replace('\n', ' ').strip())
 
 def collect_tests():
   status_re = re.compile(r'\bcwg([0-9]+): (.*)')
@@ -68,8 +78,8 @@ def get_issues(path):
     print(ex, file=sys.stderr)
     sys.exit(1)
 
-  return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]),
-                key = lambda dr: dr.issue)
+  return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]),
+                key = lambda dr: dr.number)
 
 
 issue_list_path = None
@@ -127,9 +137,10 @@ out_html.append('''\
 
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
   <tr>
     <th>Number</th>
+    <th>Section</th>
     <th>Status</th>
     <th>Issue title</th>
     <th>Available in Clang?</th>
@@ -149,7 +160,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -236,7 +247,7 @@ def availability(issue):
       avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
       _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -254,7 +265,7 @@ for dr in drs:
   elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'):
     row_style = ' class="open"'
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
     except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -267,12 +278,12 @@ for dr in drs:
     if unresolved_status != dr.status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
-          % (dr.issue, unresolved_status, dr.status))
+          % (dr.number, unresolved_status, dr.status))
       continue
   else:
     row_style = ''
     try:
-      avail, avail_style, 
unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -281,7 +292,7 @@ for dr in drs: if unresolved_status: availability_error_occurred = True print("error: issue %s is marked '%s', even though it is resolved in CWG index" \ - % (dr.issue, unresolved_status)) + % (dr.number, unresolved_status)) continue if not avail.startswith('Sup') and not avail.startswith('Dup'): @@ -297,8 +308,9 @@ for dr in drs: {details} </details>''' out_html.append(f''' - <tr{row_style} id="{dr.issue}"> - <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td> + <tr{row_style} id="{dr.number}"> + <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td> + <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td> <td>{dr.status}</td> <td>{dr.title}</td> <td{avail_style} align="center">{avail}</td> From bb9bd5f263226840194b28457ddf9861986db51f Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" <jdenny.ornl@gmail.com> Date: Mon, 3 Nov 2025 10:19:12 -0500 Subject: [PATCH 019/313] [LoopUnroll] Fix assert fail on zeroed branch weights (#165938) BranchProbability fails an assert when its denominator is zero. Reported at <https://github.com/llvm/llvm-project/pull/159163#pullrequestreview-3406318423>. --- llvm/lib/Transforms/Utils/LoopUtils.cpp | 5 +++- .../LoopUnroll/zeroed-branch-weights.ll | 30 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471bee5579..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. 
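+; (Illustration only, not part of the original test: with
+; !{!"branch_weights", i32 0, i32 0} the extracted weights sum to zero, so
+; the old code would construct a BranchProbability with denominator 0 and
+; trip its assert; getBranchProbability now returns
+; BranchProbability::getUnknown() instead and unrolling proceeds.)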
+ +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} From 24c22b7de620669aed9da28de323309c44a58244 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com> Date: Mon, 3 Nov 2025 09:27:45 -0600 Subject: [PATCH 020/313] [flang][OpenMP] Sort and move macro-based clause checks to the end, NFC (#166175) --- flang/lib/Semantics/check-omp-structure.cpp | 206 ++++++++++---------- 1 file changed, 100 insertions(+), 106 deletions(-) diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index e7e3f4d886b34..d7db15dd37949 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -195,30 +195,6 @@ void OmpStructureChecker::Leave(const parser::ExecutionPart &) { partStack_.pop_back(); } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } - -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } - -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } - -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } - // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct class OmpWorkshareBlockChecker { @@ -3391,88 +3367,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. 
-CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) -CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, 
OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -5545,4 +5439,104 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. +CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) 
+CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) +CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics From 47c54d55c9fac5ea7c87881e00f96e8c12b18174 Mon Sep 17 00:00:00 2001 From: Matthew Nagy <matthew.nagy@sony.com> Date: Mon, 3 Nov 2025 15:37:26 +0000 Subject: [PATCH 021/313] [UBSan] Improve error message when a misalignment is due to target default assumed alignment --- clang/lib/CodeGen/CGExprCXX.cpp | 21 ++++++++--- clang/lib/CodeGen/CodeGenFunction.h | 5 ++- compiler-rt/lib/ubsan/ubsan_checks.inc | 1 + compiler-rt/lib/ubsan/ubsan_handlers.cpp | 33 +++++++++++++---- .../TestCases/TypeCheck/minimum-alignment.cpp | 36 +++++++++++++++++++ .../ubsan/TestCases/TypeCheck/misaligned.cpp | 2 +- 6 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 14d8db32bafc6..f2dd22e9bed3b 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -18,6 +18,9 @@ #include "ConstantEmitter.h" #include "TargetInfo.h" #include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/Sanitizers.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "llvm/IR/Intrinsics.h" @@ -1749,6 +1752,17 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { allocator->isReservedGlobalPlacementOperator()) result = Builder.CreateLaunderInvariantGroup(result); + // Check the default alignment of the type and why. Users may incorrectly + // return misaligned memory from a replaced operator new without knowing + // about default alignment. 
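+  // ("Why": the target grants new-expressions a minimum assumed alignment.
+  // For example, a user-replaced 'operator new' may hand back malloc()+8,
+  // which satisfies an 8-byte-aligned type but not the 16-byte alignment
+  // assumed here; see minimum-alignment.cpp below.)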
+ TypeCheckKind checkKind = CodeGenFunction::TCK_ConstructorCall;
+ const TargetInfo &TI = getContext().getTargetInfo();
+ unsigned DefaultTargetAlignment = TI.getNewAlign() / TI.getCharWidth();
+ if (SanOpts.has(SanitizerKind::Alignment) &&
+ (DefaultTargetAlignment >
+ CGM.getContext().getTypeAlignInChars(allocType).getQuantity()))
+ checkKind = CodeGenFunction::TCK_ConstructorCallMinimumAlign;
+
 // Emit sanitizer checks for pointer value now, so that in the case of an
 // array it was checked only once and not at each constructor call. We may
 // have already checked that the pointer is non-null.
@@ -1756,10 +1770,9 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) {
 // we'll null check the wrong pointer here.
 SanitizerSet SkippedChecks;
 SkippedChecks.set(SanitizerKind::Null, nullCheck);
- EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall,
- E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
- result, allocType, result.getAlignment(), SkippedChecks,
- numElements);
+ EmitTypeCheck(
+ checkKind, E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
+ result, allocType, result.getAlignment(), SkippedChecks, numElements);
 
 EmitNewInitializer(*this, E, allocType, elementTy, result, numElements,
 allocSizeWithoutCookie);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 8c4c1c8c2dc95..047ca844c79de 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3296,7 +3296,10 @@ class CodeGenFunction : public CodeGenTypeCache {
 TCK_NonnullAssign,
 /// Checking the operand of a dynamic_cast or a typeid expression. Must be
 /// null or an object within its lifetime.
- TCK_DynamicOperation
+ TCK_DynamicOperation,
+ /// Checking the 'this' pointer for a constructor call, including that the
+ /// alignment is greater than or equal to the target's minimum alignment.
+ TCK_ConstructorCallMinimumAlign
 };
 
 /// Determine whether the pointer type check \p TCK permits null pointers.
diff --git a/compiler-rt/lib/ubsan/ubsan_checks.inc b/compiler-rt/lib/ubsan/ubsan_checks.inc
index b1d09a9024e7e..f8757d781afb8 100644
--- a/compiler-rt/lib/ubsan/ubsan_checks.inc
+++ b/compiler-rt/lib/ubsan/ubsan_checks.inc
@@ -28,6 +28,7 @@ UBSAN_CHECK(NullptrAfterNonZeroOffset, "nullptr-after-nonzero-offset",
 UBSAN_CHECK(PointerOverflow, "pointer-overflow", "pointer-overflow")
 UBSAN_CHECK(MisalignedPointerUse, "misaligned-pointer-use", "alignment")
 UBSAN_CHECK(AlignmentAssumption, "alignment-assumption", "alignment")
+UBSAN_CHECK(MinimumAssumedAlignment, "minimum-assumed-alignment", "alignment")
 UBSAN_CHECK(InsufficientObjectSize, "insufficient-object-size", "object-size")
 UBSAN_CHECK(SignedIntegerOverflow, "signed-integer-overflow",
 "signed-integer-overflow")
diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
index 63319f46734a4..fc6063af4562b 100644
--- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
@@ -73,14 +73,26 @@ enum TypeCheckKind {
 TCK_NonnullAssign,
 /// Checking the operand of a dynamic_cast or a typeid expression. Must be
 /// null or an object within its lifetime.
- TCK_DynamicOperation
+ TCK_DynamicOperation,
+ /// Checking the 'this' pointer for a constructor call, including that the
+ /// alignment is greater than or equal to the target's minimum alignment.
+ TCK_ConstructorCallMinimumAlign
 };
 
 extern const char *const TypeCheckKinds[] = {
- "load of", "store to", "reference binding to", "member access within",
- "member call on", "constructor call on", "downcast of", "downcast of",
- "upcast of", "cast to virtual base of", "_Nonnull binding to",
- "dynamic operation on"};
+ "load of",
+ "store to",
+ "reference binding to",
+ "member access within",
+ "member call on",
+ "constructor call on",
+ "downcast of",
+ "downcast of",
+ "upcast of",
+ "cast to virtual base of",
+ "_Nonnull binding to",
+ "dynamic operation on",
+ "constructor call with pointer from operator new on"};
 }
 
 static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
@@ -94,7 +106,9 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
 ? ErrorType::NullPointerUseWithNullability
 : ErrorType::NullPointerUse;
 else if (Pointer & (Alignment - 1))
- ET = ErrorType::MisalignedPointerUse;
+ ET = (Data->TypeCheckKind == TCK_ConstructorCallMinimumAlign)
+ ? ErrorType::MinimumAssumedAlignment
+ : ErrorType::MisalignedPointerUse;
 else
 ET = ErrorType::InsufficientObjectSize;
 
@@ -117,6 +131,13 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
 Diag(Loc, DL_Error, ET, "%0 null pointer of type %1")
 << TypeCheckKinds[Data->TypeCheckKind] << Data->Type;
 break;
+ case ErrorType::MinimumAssumedAlignment:
+ Diag(Loc, DL_Error, ET,
+ "%0 misaligned address %1 for type %2, "
+ "which requires target minimum assumed %3 byte alignment")
+ << TypeCheckKinds[Data->TypeCheckKind] << (void *)Pointer << Data->Type
+ << Alignment;
+ break;
 case ErrorType::MisalignedPointerUse:
 Diag(Loc, DL_Error, ET, "%0 misaligned address %1 for type %3, "
 "which requires %2 byte alignment")
diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp
new file mode 100644
index 0000000000000..4642126ab74c4
--- /dev/null
+++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp
@@ -0,0 +1,36 @@
+// RUN: %clangxx %gmlt -fsanitize=alignment %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: i386
+// UNSUPPORTED: armv7l
+
+// These sanitizers already overload the new operator so won't compile this test
+// UNSUPPORTED: ubsan-msan
+// UNSUPPORTED: ubsan-tsan
+
+#include <cassert>
+#include <cstdlib>
+
+void *operator new(std::size_t count) {
+ constexpr const size_t offset = 8;
+
+ // allocate a bit more so we can safely offset it
+ void *ptr = std::malloc(count + offset);
+
+ // verify malloc returned 16-byte aligned memory
+ static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ == 16);
+ assert(((std::ptrdiff_t)ptr & (__STDCPP_DEFAULT_NEW_ALIGNMENT__ - 1)) == 0);
+
+ return (char *)ptr + offset;
+}
+
+struct Foo {
+ void *_cookie1, *_cookie2;
+};
+
+static_assert(alignof(Foo) == 8);
+int main() {
+ // CHECK: runtime error: constructor call with pointer from operator new on misaligned address 0x{{.*}} for type 'Foo', which requires target minimum assumed 16 byte alignment
+ Foo *f = new Foo;
+ return 0;
+}
diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp
index e39a0ab4e6589..4b0b2b5923c6f 100644
--- a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp
+++ 
b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp @@ -101,7 +101,7 @@ int main(int, char **argv) { return s->f() && 0; case 'n': - // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires 4 byte alignment + // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call with pointer from operator new on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires target minimum assumed 4 byte alignment // CHECK-NEW-NEXT: [[PTR]]: note: pointer points here // CHECK-NEW-NEXT: {{^ 00 00 00 01 02 03 04 05}} // CHECK-NEW-NEXT: {{^ \^}} From 613c6de977d2917ef9f6c3d14522656f31448f4d Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Mon, 3 Nov 2025 16:42:34 +0100 Subject: [PATCH 022/313] [clang] Adjust TextDiagnostic style ranges for interesting source region (#164941) After: <img width="1904" height="186" alt="Screenshot From 2025-10-24 09-59-40" src="https://github.com/user-attachments/assets/c860227f-50c5-4afe-a959-83e3452fc72d" /> <img width="1366" height="204" alt="Screenshot From 2025-10-24 09-59-12" src="https://github.com/user-attachments/assets/450bffec-b4b2-465c-b435-bddf8ebdbd32" /> <img width="1310" height="204" alt="Screenshot From 2025-10-24 09-58-53" src="https://github.com/user-attachments/assets/8015ec6f-e032-4f0b-b55c-b2c718d14f6b" /> --- clang/lib/Frontend/TextDiagnostic.cpp | 76 ++++++++++++++----- ...diags-interesting-source-region-colors.cpp | 30 ++++++++ 2 files changed, 87 insertions(+), 19 deletions(-) create mode 100644 clang/test/Frontend/diags-interesting-source-region-colors.cpp diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index aea3e72d92a84..10032184b5d94 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -349,14 +349,13 @@ struct SourceColumnMap { /// When the source code line we want to print is too long for /// the terminal, select the "interesting" region. -static void selectInterestingSourceRegion(std::string &SourceLine, - std::string &CaretLine, - std::string &FixItInsertionLine, - Columns NonGutterColumns, - const SourceColumnMap &Map) { - Columns CaretColumns = Columns(CaretLine.size()); - Columns FixItColumns = - Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); +static void selectInterestingSourceRegion( + std::string &SourceLine, std::string &CaretLine, + std::string &FixItInsertionLine, Columns NonGutterColumns, + const SourceColumnMap &Map, + SmallVectorImpl<clang::TextDiagnostic::StyleRange> &Styles) { + Columns CaretColumns = CaretLine.size(); + Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine); Columns MaxColumns = std::max({Map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // Find the slice that we need to display the full caret line // correctly. 
Columns CaretStart = 0, CaretEnd = CaretLine.size(); - for (; CaretStart != CaretEnd; CaretStart = CaretStart.next()) - if (!isWhitespace(CaretLine[CaretStart.V])) - break; + while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V])) + CaretStart = CaretStart.next(); - for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev()) - if (!isWhitespace(CaretLine[CaretEnd.V - 1])) - break; + while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V])) + CaretEnd = CaretEnd.prev(); // caret has already been inserted into CaretLine so the above whitespace // check is guaranteed to include the caret @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine, assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved > NonGutterColumns); + // Since we've modified the SourceLine, we also need to adjust the line's + // highlighting information. In particular, if we've removed + // from the front of the line, we need to move the style ranges to the + // left and remove unneeded ranges. + // Note in particular that variables like CaretEnd are defined in the + // CaretLine, which only contains ASCII, while the style ranges are defined in + // the source line, where we have to care for the byte-index != column-index + // case. + Bytes BytesRemoved = + FrontColumnsRemoved > FrontEllipse.size() + ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size())) + : 0; + Bytes CodeEnd = + CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V; + for (TextDiagnostic::StyleRange &R : Styles) { + // Remove style ranges before and after the new truncated snippet. + if (R.Start >= static_cast<unsigned>(CodeEnd.V) || + R.End < static_cast<unsigned>(BytesRemoved.V)) { + R.Start = R.End = std::numeric_limits<int>::max(); + continue; + } + // Move them left. (Note that this can wrap R.Start, but that doesn't + // matter). + R.Start -= BytesRemoved.V; + R.End -= BytesRemoved.V; + + // Don't leak into the ellipse at the end. + if (R.Start < static_cast<unsigned>(CodeEnd.V) && + R.End > static_cast<unsigned>(CodeEnd.V)) + R.End = CodeEnd.V + 1; // R.End is inclusive. + } + // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back if (BackColumnsRemoved > Columns(BackEllipse.size())) SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse); // If that's enough then we're done - if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) + if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns) return; // Otherwise remove the front as well @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret( OS.indent(MaxLineNoDisplayWidth + 2) << "| "; }; + Columns MessageLength = DiagOpts.MessageLength; + // If we don't have enough columns available, just abort now. + if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4)) + return; + // Prepare source highlighting information for the lines we're about to // emit, starting from the first line. std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles = @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret( // If the source line is too long for our terminal, select only the // "interesting" source region within that line. 
- Columns MessageLength = DiagOpts.MessageLength; - if (MessageLength.V != 0) + if (MessageLength != 0) { + Columns NonGutterColumns = MessageLength; + if (MaxLineNoDisplayWidth != 0) + NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4); selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine, - MessageLength, SourceColMap); + NonGutterColumns, SourceColMap, + SourceStyles[LineNo - Lines.first]); + } // If we are in -fdiagnostics-print-source-range-info mode, we are trying // to produce easily machine parsable output. Add a space before the @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, // Print the source line one character at a time. bool PrintReversed = false; std::optional<llvm::raw_ostream::Colors> CurrentColor; - size_t I = 0; + size_t I = 0; // Bytes. while (I < SourceLine.size()) { auto [Str, WasPrintable] = printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop); diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp new file mode 100644 index 0000000000000..80db0873b9e0a --- /dev/null +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp @@ -0,0 +1,30 @@ +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s + +// REQUIRES: ansi-escape-sequences + +int main() { + 1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + /*😂*/1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ... + + + /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ... 
+ + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; + + "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; +} + + From 28d3194d113fbbe0534fe81398045e2d56d40886 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Mon, 3 Nov 2025 16:47:12 +0100 Subject: [PATCH 023/313] [libc++] Merge basic_string::__{replace,reset}_internal_buffer (#165404) --- libcxx/include/string | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..33382c7af4b2c 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -914,6 +914,10 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -2259,18 +2263,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2442,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2708,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2744,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. 
__buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3392,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3431,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) { return; From 3bd59634d2d21328e32c06ecd6a7a1188a1b1993 Mon Sep 17 00:00:00 2001 From: William Moses <gh@wsmoses.com> Date: Mon, 3 Nov 2025 09:57:21 -0600 Subject: [PATCH 024/313] [MLIR][SCF] Speed up ConditionPropagation (#166080) Introduce a cache to avoid looking up then/else region nesting through `isAncestor` calls repeatedly. This gets expensive for large inputs with lots of pointer chasing. Fixes https://github.com/llvm/llvm-project/issues/166039 --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 47 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 2946b53c8cb36..881e256a8797b 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -2565,6 +2565,39 @@ struct ConvertTrivialIfToSelect : public OpRewritePattern<IfOp> { struct ConditionPropagation : public OpRewritePattern<IfOp> { using OpRewritePattern<IfOp>::OpRewritePattern; + /// Kind of parent region in the ancestor cache. + enum class Parent { Then, Else, None }; + + /// Returns the kind of region ("then", "else", or "none") of the + /// IfOp that the given region is transitively nested in. Updates + /// the cache accordingly. 
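+ /// For example, an op inside an scf.for that is itself nested in the IfOp's
+ /// then-region resolves to Parent::Then; every region visited on the walk up
+ /// is memoized in the cache so later queries stop at the first cached hit.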
+ static Parent getParentType(Region *toCheck, IfOp op, + DenseMap<Region *, Parent> &cache, + Region *endRegion) { + SmallVector<Region *> seen; + while (toCheck != endRegion) { + auto found = cache.find(toCheck); + if (found != cache.end()) + return found->second; + seen.push_back(toCheck); + if (&op.getThenRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Then; + return Parent::Then; + } + if (&op.getElseRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Else; + return Parent::Else; + } + toCheck = toCheck->getParentRegion(); + } + + for (Region *region : seen) + cache[region] = Parent::None; + return Parent::None; + } + LogicalResult matchAndRewrite(IfOp op, PatternRewriter &rewriter) const override { // Early exit if the condition is constant since replacing a constant @@ -2580,9 +2613,12 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { Value constantTrue = nullptr; Value constantFalse = nullptr; + DenseMap<Region *, Parent> cache; for (OpOperand &use : llvm::make_early_inc_range(op.getCondition().getUses())) { - if (op.getThenRegion().isAncestor(use.getOwner()->getParentRegion())) { + switch (getParentType(use.getOwner()->getParentRegion(), op, cache, + op.getCondition().getParentRegion())) { + case Parent::Then: { changed = true; if (!constantTrue) @@ -2591,8 +2627,9 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantTrue); }); - } else if (op.getElseRegion().isAncestor( - use.getOwner()->getParentRegion())) { + break; + } + case Parent::Else: { changed = true; if (!constantFalse) @@ -2601,6 +2638,10 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantFalse); }); + break; + } + case Parent::None: + break; } } From bf5332cd8266cddb3bfb2e8436a2161c10602855 Mon Sep 17 00:00:00 2001 From: Konrad Kleine <kkleine@redhat.com> Date: Mon, 3 Nov 2025 17:00:06 +0100 Subject: [PATCH 025/313] [flang][driver] Bring --gcc-triple to flang (#165886) When there are multiple gcc versions installed, we want `flang` to be able to find the right one based on the triple. 
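For example (source file name hypothetical), the desired installation can be
pinned explicitly on the command line:

```
flang --gcc-triple=x86_64-redhat-linux -c hello.f90
```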
Here's `flang` selecting an unwanted `gcc` candidate installation, namely `/usr/lib/gcc/x86_64-linux-gnu/15`: ``` ~/src/llvm-project/main/build-RelWithDebInfo > ./bin/flang -v flang version 22.0.0custombuild Target: x86_64-redhat-linux-gnu Thread model: posix InstalledDir: /home/fedora/src/llvm-project/main/build-RelWithDebInfo/bin System configuration file directory: /etc/clang/ Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/15 Found candidate GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/15 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Selected multilib: .;@m64 ``` When passing `--gcc-triple=x86_64-redhat-linux` we get the desired gcc candidate installation: ``` ~/src/llvm-project/main/build-RelWithDebInfo > ./bin/flang --gcc-triple=x86_64-redhat-linux -v flang version 22.0.0custombuild Target: x86_64-redhat-linux-gnu Thread model: posix InstalledDir: /home/fedora/src/llvm-project/main/build-RelWithDebInfo/bin System configuration file directory: /etc/clang/ Found candidate GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Selected GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Selected multilib: .;@m64 ``` * Test: `LIT_FILTER="Flang :: Driver/gcc-triple.f90" ninja check-flang` * Copied `flang/test/Driver/Inputs/fedora_39_tree` from `clang/test/Driver/Inputs/fedora_39_tree`. * Testing what default triple is selected when two are possible. * Testing that we can select an existing triple. * Testing that triple is not selected if it doesn't exist. --- clang/include/clang/Driver/Options.td | 1 + .../usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crtend.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crti.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crtn.o | 0 .../lib/gcc/x86_64-redhat-linux/13/crtbegin.o | 0 .../lib/gcc/x86_64-redhat-linux/13/crtend.o | 0 .../usr/lib/gcc/x86_64-redhat-linux/13/crti.o | 0 .../usr/lib/gcc/x86_64-redhat-linux/13/crtn.o | 0 flang/test/Driver/gcc-triple.f90 | 18 ++++++++++++++++++ 10 files changed, 19 insertions(+) create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o create mode 100644 flang/test/Driver/gcc-triple.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..4778b87b789a9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. 
" "Flang will use the GCC installation with the largest version">; def gcc_triple_EQ : Joined<["--"], "gcc-triple=">, + Visibility<[ClangOption, FlangOption]>, HelpText<"Search for the GCC installation with the specified triple.">; def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>, Group<Preprocessor_Group>, diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..027d78a7c5046 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! 
TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux
\ No newline at end of file

From 7d5659083cb21722416b38fe92b7200fe89be232 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Mon, 3 Nov 2025 06:25:20 -0800
Subject: [PATCH 026/313] [SLP]Do not create copyable node, if parent node is
 non-schedulable and has a use in binop.

If the parent node is non-schedulable (only externally used
instructions), and at least one instruction has multiple uses and is
used in the binop, such a copyable node should not be created.
Otherwise, it may contain a wrong def-use chain model, which cannot be
effectively detected.

Fixes #166035
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 21 ++++++++++
 .../parent-non-schedule-multi-use-in-binop.ll | 40 +++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 34b405ced8c0a..bf3f52c51b64c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 if (isa<PHINode>(S.getMainOp()) ||
 isVectorLikeInstWithConstOps(S.getMainOp()))
 return nullptr;
+ // If the parent node is non-schedulable and the current node is copyable,
+ // and any of the parent's instructions are used outside several basic blocks
+ // or in a bin-op node, cancel scheduling: it may cause wrong def-use deps in
+ // the analysis, leading to a crash.
+ // Non-scheduled nodes may not have a related ScheduleData model, which may
+ // lead to a skipped dep analysis.
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+ EI.UserTE->doesNotNeedToSchedule() &&
+ EI.UserTE->getOpcode() != Instruction::PHI &&
+ any_of(EI.UserTE->Scalars, [](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || I->hasOneUser())
+ return false;
+ for (User *U : I->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (isa<BinaryOperator>(UI))
+ return true;
+ }
+ return false;
+ }))
+ return std::nullopt;
 
 bool HasCopyables = S.areInstructionsWithCopyableElements();
 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
new file mode 100644
index 0000000000000..590b0be973002
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@a = common global [100 x i64] zeroinitializer, align 64
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]]
+; CHECK: [[LOP_RHSCNT_I_PEEL]]:
+; CHECK-NEXT: 
[[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} From 5b7a5f7c5f3c1dcd8ea9debf56983b85e393bc86 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Mon, 3 Nov 2025 16:03:54 +0000 Subject: [PATCH 027/313] [X86] Narrow BT/BTC/BTR/BTS compare + RMW patterns on very large integers (REAPPLIED) (#166176) This patch allows us to narrow single bit-test/twiddle operations for larger than legal scalar integers to efficiently operate just on the i32 sub-integer block actually affected. The BITOP(X,SHL(1,IDX)) patterns are split, with the IDX used to access the specific i32 block as well as specific bit within that block. BT comparisons are relatively simple, and builds on the truncated shifted loads fold from #165266. BTC/BTR/BTS bit twiddling patterns need to match the entire RMW pattern to safely confirm only one block is affected, but a similar approach is taken and creates codegen that should allow us to further merge with matching BT opcodes in a future patch (see #165291). The resulting codegen is notably more efficient than the heavily micro-coded memory folded variants of BT/BTC/BTR/BTS. There is still some work to improve the bit insert 'init' patterns included in bittest-big-integer.ll but I'm expecting this to be a straightforward future extension. REAPPLIED from #165540 which was reverted due to a sanitizer regression that should have been fixed by #166160 Fixes #164225 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 114 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 7521 +++--------------- 2 files changed, 1208 insertions(+), 6427 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e5b2743f602da..2970cf42df731 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53347,6 +53347,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + + // Only handle normal stores and its chain was a matching normal load. 
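+  // (That is, the load must feed the store's chain directly and use the same
+  // address, so no other memory operation can touch the wide value between
+  // the load and the store we are about to narrow.)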
+ auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); + if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || + !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset()) + return SDValue(); + + SDValue LoadVal(Ld, 0); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow larger than legal scalar integers. + if (!VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + SDValue ShAmt; + if (!StoredVal.hasOneUse() || + !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + return SDValue(); + + // Ensure the shift amount is in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + + // Compute the byte offset for the i32 block that is changed by the RMW. + // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. + SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = + DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + + SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53573,6 +53647,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54505,8 +54582,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -56305,6 +56383,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56363,6 +56442,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 8007d9dcf13bc..c311ab869c311 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; 
X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl $0, %edx ; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: notl %esi +; X86-NEXT: notl %edx ; X86-NEXT: je .LBB9_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB9_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ebx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: movl %esi, 4(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,112 +445,160 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: 
xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: 
set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs %ld = load i128, ptr %word %test = and i128 %ld, %bit + %res = or i128 %ld, %bit %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word ret i1 %cmp } -define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i128: +define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -629,8 +606,9 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $96, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %ebx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -644,179 +622,85 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl 64(%esp,%eax), %edx +; X86-NEXT: movl 68(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl 72(%esp,%esi), %ebx +; X86-NEXT: movl 76(%esp,%esi), %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded 
Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res = xor i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word - ret i1 %cmp -} - -define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i128: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: notl %edi ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx -; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax +; 
X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl 36(%esp,%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx +; X86-NEXT: movl 40(%esp,%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl 8(%eax), %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl 44(%esp,%eax), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 12(%ecx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: notl %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl 32(%esp,%eax), %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl (%eax), %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 4(%ecx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ecx,%eax), %eax +; X86-NEXT: btl %esi, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: setae %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -824,433 +708,274 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: +; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: movl $1, %esi +; SSE-NEXT: xorl %r8d, %r8d +; SSE-NEXT: shldq %cl, %rsi, %r8 +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: xorl 
%r9d, %r9d ; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rsi, %r8 +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: notq %r8 ; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: cmovneq %r9, %rax ; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: andq 8(%rdi), %r8 +; SSE-NEXT: orq %rdx, %r8 +; SSE-NEXT: andq (%rdi), %rsi +; SSE-NEXT: orq %rax, %rsi +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: andl $96, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: setae %al +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; AVX2-LABEL: init_eq_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rax, %rsi +; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rax, %rsi +; AVX2-NEXT: cmovneq %r9, %rax +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: cmovneq %rdx, %r8 +; AVX2-NEXT: cmovneq %r9, %rdx +; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $96, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: movl (%rdi,%rax), %eax +; AVX2-NEXT: btl %ecx, %eax +; AVX2-NEXT: setae %al +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: init_eq_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: xorl %r8d, %r8d +; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: movl %edx, %edx +; AVX512-NEXT: xorl %r9d, %r9d +; AVX512-NEXT: shldq %cl, %rdx, %r9 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rax, %rsi +; AVX512-NEXT: cmovneq %r8, %rax +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: cmovneq %rdx, %r9 +; AVX512-NEXT: cmovneq %r8, %rdx +; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: andnq (%rdi), %rax, %r8 +; AVX512-NEXT: orq %rdx, %r8 +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: andl $96, %eax +; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: movl (%rdi,%rax), %eax +; AVX512-NEXT: btl %ecx, %eax +; AVX512-NEXT: setae %al +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq %rem = and i32 %position, 
127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs %mask = xor i128 %bit, -1 + %val0 = zext i1 %value to i128 + %val = shl nuw i128 %val0, %ofs %ld = load i128, ptr %word %test = and i128 %ld, %bit - %res = and i128 %ld, %mask + %res0 = and i128 %ld, %mask + %res = or i128 %res0, %val %cmp = icmp eq i128 %test, 0 store i128 %res, ptr %word ret i1 %cmp } -define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i128: +; i512 + +define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: 
orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: set_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res = or i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = xor i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word ret i1 %cmp } -define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i128: +define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: init_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; 
AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) -; AVX512-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %mask = xor i128 %bit, -1 - %val0 = zext i1 %value to i128 - %val = shl nuw i128 %val0, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res0 = and i128 %ld, %mask - %res = or i128 %res0, %val - %cmp = icmp eq i128 %test, 0 - store i128 %res, ptr %word +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = and i512 %ld, %mask + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word ret i1 %cmp } -; i512 +define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} -define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: test_ne_i512: +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -1258,14 +983,14 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp +; X86-NEXT: subl $352, %esp # imm = 0x160 ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1298,325 +1023,88 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %ebx +; X86-NEXT: movl 52(%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl 48(%eax), %edi +; X86-NEXT: movl 44(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%eax), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: andl $31, %ecx ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; 
X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi 
-; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; 
AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - ret i1 %cmp -} - -define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; 
X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1632,7 +1120,6 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1649,191 +1136,131 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 60(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 52(%eax), %edi ; 
X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 56(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 48(%eax), %esi ; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl 52(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 44(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 48(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 40(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 44(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 36(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 40(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 32(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 36(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 28(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 32(%edx), %ebx ; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 24(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 28(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 20(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 24(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 16(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 20(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 12(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 16(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 8(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 12(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 4(%eax), %edi +; X86-NEXT: 
shldl %cl, %edi, %esi +; X86-NEXT: andl 8(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: andl 4(%edx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%edx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 60(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 56(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 52(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 36(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 32(%edx) @@ -1847,15 +1274,10 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl %eax, 16(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al +; X86-NEXT: movl %ebx, 8(%edx) +; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: setae %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1863,7 +1285,7 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i512: +; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -1871,94 +1293,121 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: movslq %eax, %r12 +; SSE-NEXT: movq 136(%rsp,%r12), %r9 +; SSE-NEXT: movq 144(%rsp,%r12), 
%rax +; SSE-NEXT: movq %rax, %rsi +; SSE-NEXT: shldq %cl, %r9, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 152(%rsp,%r12), %r11 +; SSE-NEXT: shldq %cl, %rax, %r11 +; SSE-NEXT: movq 120(%rsp,%r12), %r10 +; SSE-NEXT: movq 128(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %rbx +; SSE-NEXT: shldq %cl, %r10, %rbx +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: movq 104(%rsp,%r12), %r14 +; SSE-NEXT: movq 112(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %r15 +; SSE-NEXT: shldq %cl, %r14, %r15 +; SSE-NEXT: shldq %cl, %rax, %r10 +; SSE-NEXT: movq 96(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %r13 +; SSE-NEXT: shlq %cl, %r13 +; SSE-NEXT: shldq %cl, %rax, %r14 +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: movq 8(%rsp,%r12), %r8 +; SSE-NEXT: movq 16(%rsp,%r12), %rsi +; SSE-NEXT: movq %rsi, %rbp ; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: notq %rax +; SSE-NEXT: andq 48(%rdi), %rax +; SSE-NEXT: orq %rbp, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: notq %rbx +; SSE-NEXT: notq %r11 +; SSE-NEXT: movq 24(%rsp,%r12), %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq -8(%rsp,%r12), %rbp +; SSE-NEXT: movq (%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shldq %cl, %rbp, %rsi +; SSE-NEXT: andq 56(%rdi), %r11 +; SSE-NEXT: andq 32(%rdi), %rbx +; SSE-NEXT: orq %rax, %r11 +; SSE-NEXT: orq %rsi, %rbx +; SSE-NEXT: notq %r15 +; SSE-NEXT: shldq %cl, %rdx, %r8 +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq 40(%rdi), %r9 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq -24(%rsp,%r12), %rax +; SSE-NEXT: movq -16(%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shldq %cl, %rax, %rsi +; SSE-NEXT: andq 16(%rdi), %r15 +; SSE-NEXT: orq %rsi, %r15 +; SSE-NEXT: shldq %cl, %rdx, %rbp +; SSE-NEXT: notq %r10 +; SSE-NEXT: notq %r13 +; SSE-NEXT: movq -32(%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: andq 24(%rdi), %r10 +; SSE-NEXT: andq (%rdi), %r13 +; SSE-NEXT: orq %rbp, %r10 +; SSE-NEXT: orq %rsi, %r13 +; SSE-NEXT: notq %r14 ; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: 
movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: andq 8(%rdi), %r14 +; SSE-NEXT: orq %rax, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andl $60, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq %rax, 48(%rdi) +; SSE-NEXT: movq %r11, 56(%rdi) +; SSE-NEXT: movq %rbx, 32(%rdi) +; SSE-NEXT: movq %r9, 40(%rdi) +; SSE-NEXT: movq %r15, 16(%rdi) +; SSE-NEXT: movq %r10, 24(%rdi) +; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: setae %al +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r13 @@ -1967,7 +1416,7 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_ne_i512: +; AVX2-LABEL: init_eq_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -1975,4540 +1424,105 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: subq $184, %rsp ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), 
%rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; 
AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = xor i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - 
ret i1 %cmp -} - -define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; 
X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: 
retl -; -; SSE-LABEL: reset_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) 
-; SSE-NEXT: movq %r14, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq %r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; 
AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, %rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = and i512 %ld, %mask - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
[remainder of the auto-generated X86 CHECK block for set_ne_i512: flattened in extraction; only the epilogue is recoverable]
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
-;
-; SSE-LABEL: set_ne_i512:
-; SSE: # %bb.0:
[auto-generated SSE CHECK block]
-;
-; AVX2-LABEL: set_ne_i512:
-; AVX2: # %bb.0:
[auto-generated AVX2 CHECK block]
-;
-; AVX512-LABEL: set_ne_i512:
-; AVX512: # %bb.0:
[auto-generated AVX512 CHECK block]
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res = or i512 %ld, %bit
- %cmp = icmp ne i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
-; X86-LABEL: init_eq_i512:
-; X86: # %bb.0:
[auto-generated X86 CHECK block]
-;
-; SSE-LABEL: init_eq_i512:
-; SSE: # %bb.0:
[auto-generated SSE CHECK block]
-;
-; AVX2-LABEL: init_eq_i512:
-; AVX2: # %bb.0:
[auto-generated AVX2 CHECK block]
-;
-; AVX512-LABEL: init_eq_i512:
-; AVX512: # %bb.0:
[auto-generated AVX512 CHECK block]
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %val0 = zext i1 %value to i512
- %val = shl nuw i512 %val0, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res0 = and i512 %ld, %mask
- %res = or i512 %res0, %val
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-; i4096
-
-define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
-; X86-LABEL: test_ne_i4096:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: andl $31, %ebx
[auto-generated X86 zero-init stores, chunk loads, and shldl chain for test_ne_i4096; the generated CHECK block continues]
movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), 
%edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: 
movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 
288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; 
X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 188(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 380(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 124(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 508(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: andl 252(%esi), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: negl %ecx -; X86-NEXT: movl 1648(%esp,%ecx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 128(%edx), %ecx -; X86-NEXT: andl 384(%edx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 256(%edx), %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 260(%edx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 4(%edx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 132(%edx), %eax -; X86-NEXT: andl 388(%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i4096: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $1576, %rsp # imm = 0x628 -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: andl $4032, %eax # imm = 0xFC0 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; 
SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %rsi -; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1304(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1560(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT: movq 1176(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
-; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r11, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
-; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
-; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
-; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r8
-; SSE-NEXT: shldq %cl, %r13, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %r14
-; SSE-NEXT: shldq %cl, %rdx, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
-; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, %r14
-; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
-; SSE-NEXT: movq %rbp, %r12
-; SSE-NEXT: shldq %cl, %r14, %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rbp, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r10
-; SSE-NEXT: andq 384(%rdi), %r10
-; SSE-NEXT: andq 128(%rdi), %r15
-; SSE-NEXT: andq 320(%rdi), %r13
-; SSE-NEXT: andq 64(%rdi), %rax
-; SSE-NEXT: orq %r10, %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: andq 448(%rdi), %r9
-; SSE-NEXT: andq 192(%rdi), %rbp
-; SSE-NEXT: orq %r9, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq 288(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 416(%rdi), %rdx
-; SSE-NEXT: andq 160(%rdi), %r11
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 352(%rdi), %rdx
-; SSE-NEXT: orq %r9, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 96(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 480(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 224(%rdi), %r8
-; SSE-NEXT: orq %rax, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq 272(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 400(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 144(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 336(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 80(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 464(%rdi), %rdx
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 208(%rdi), %r11
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %r8, %r11
-; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
-; SSE-NEXT: andq 304(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 432(%rdi), %r9
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 176(%rdi), %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 368(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 112(%rdi), %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 496(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: andq 240(%rdi), %rbp
-; SSE-NEXT: orq %r8, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r11, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 392(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: andq 136(%rdi), %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 328(%rdi), %rdx
-; SSE-NEXT: orq %rax, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 72(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 456(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE-NEXT: andq 200(%rdi), %r13
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 296(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 424(%rdi), %r8
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 168(%rdi), %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 360(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 104(%rdi), %rax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 488(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: andq 232(%rdi), %r15
-; SSE-NEXT: orq %rax, %r15
-; SSE-NEXT: orq %r8, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 280(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 408(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 152(%rdi), %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i4096:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
-; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r12, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: shrl $3, %ebx
+; AVX2-NEXT: movl %ebx, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r11
+; AVX2-NEXT: movq 128(%rsp,%r11), %r15
+; AVX2-NEXT: movq 136(%rsp,%r11), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 120(%rsp,%r11), %r8
+; AVX2-NEXT: shldq %cl, %r8, %r15
+; AVX2-NEXT: movq 144(%rsp,%r11), %r14
+; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
+; AVX2-NEXT: movq %rsi, %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %r14
+; AVX2-NEXT: movq 112(%rsp,%r11), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 160(%rsp,%r11), %r13
+; AVX2-NEXT: movq 168(%rsp,%r11), %r12
+; AVX2-NEXT: shldq %cl, %r13, %r12
+; AVX2-NEXT: shldq %cl, %rsi, %r13
+; AVX2-NEXT: shldq %cl, %rax, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
+; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rax
 ; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
-; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
-; AVX2-NEXT: movq %r8, %rdx
-; AVX2-NEXT: shldq %cl, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shldq %cl, %r9, %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: shldq %cl, %r15, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r9
-; AVX2-NEXT: andq 384(%rdi), %r9
-; AVX2-NEXT: andq 128(%rdi), %r14
-; AVX2-NEXT: andq 320(%rdi), %r10
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: movq %r14, %r15
-; AVX2-NEXT: andq 64(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq 448(%rdi), %rbp
-; AVX2-NEXT: andq 192(%rdi), %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq 288(%rdi), %r8
-; AVX2-NEXT: andq 32(%rdi), %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 416(%rdi), %rax
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: andq 160(%rdi), %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: andq 352(%rdi), %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 96(%rdi), %rax
-; AVX2-NEXT: orq %r12, %r11
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 480(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: andq 224(%rdi), %r13
-; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: movq 40(%rsp,%r11), %r10
+; AVX2-NEXT: shldq %cl, %rdx, %r10
+; AVX2-NEXT: movq 8(%rsp,%r11), %r9
+; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: shldq %cl, %rdx, %rbp
+; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
 ; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 272(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 16(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 400(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 144(%rdi), %rax
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 336(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 80(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 464(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 208(%rdi), %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r8, %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: orq %r9, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 304(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 48(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 432(%rdi), %r10
-; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: andq 176(%rdi), %rax
-; AVX2-NEXT: orq %r9, %r8
-; AVX2-NEXT: movq %r8, %r9
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 368(%rdi), %r8
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 112(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 496(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 240(%rdi), %r9
-; AVX2-NEXT: orq %r8, %r9
-; AVX2-NEXT: orq %rax, %r9
-; AVX2-NEXT: orq %r10, %r9
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 392(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: andq 136(%rdi), %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 328(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 72(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 456(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX2-NEXT: andq 200(%rdi), %r12
-; AVX2-NEXT: orq %rax, %r12
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 296(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 424(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 168(%rdi), %rax
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 360(%rdi), %r8
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 104(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 488(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: andq 232(%rdi), %r14
-; AVX2-NEXT: orq %rax, %r14
+; AVX2-NEXT: movq -8(%rsp,%r11), %rax
+; AVX2-NEXT: movq (%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
+; AVX2-NEXT: orq %r10, %r12
 ; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 280(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 408(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 152(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 344(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 88(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 472(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: andq 216(%rdi), %rbx
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: orq %r8, %rbx
-; AVX2-NEXT: orq %r10, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 312(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 56(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 440(%rdi), %r10
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 184(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: movq -16(%rsp,%r11), %r10
+; AVX2-NEXT: shlxq %rcx, %r10, %r11
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %r10, %rax
+; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 376(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 120(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 504(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 248(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: andnq (%rdi), %r8, %rsi
+; AVX2-NEXT: orq %r11, %rsi
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shlxq %rcx, %rsi, %rax
-; AVX2-NEXT: andq 256(%rdi), %r10
-; AVX2-NEXT: andq (%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: orq %r13, %rax
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andq 264(%rdi), %rcx
-; AVX2-NEXT: andq 8(%rdi), %rdx
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: orq %r12, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: andl $60, %ebx
+; AVX2-NEXT: movl (%rdi,%rbx), %eax
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX2-NEXT: btl %r9d, %eax
+; AVX2-NEXT: movq %r13, 48(%rdi)
+; AVX2-NEXT: movq %r12, 56(%rdi)
+; AVX2-NEXT: movq %r14, 32(%rdi)
+; AVX2-NEXT: movq %rdx, 40(%rdi)
+; AVX2-NEXT: movq %rcx, 16(%rdi)
+; AVX2-NEXT: movq %r10, 24(%rdi)
+; AVX2-NEXT: movq %rsi, (%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: addq $184, %rsp
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: popq %r12
 ; AVX2-NEXT: popq %r13
@@ -6518,7 +1532,7 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: test_ne_i4096:
+; AVX512-LABEL: init_eq_i512:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: pushq %rbp
 ; AVX512-NEXT: pushq %r15
@@ -6526,489 +1540,102 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 ; AVX512-NEXT: pushq %r13
 ; AVX512-NEXT: pushq %r12
 ; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX512-NEXT: subq $168, %rsp
 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT: movl %esi, %ecx
 ; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %rsi
-; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX512-NEXT: movl %esi, %r10d
+; AVX512-NEXT: shrl $3, %r10d
+; AVX512-NEXT: movl %r10d, %r8d
+; AVX512-NEXT: andl $56, %r8d
+; AVX512-NEXT: negl %r8d
+; AVX512-NEXT: movslq %r8d, %r9
+; AVX512-NEXT: movq 112(%rsp,%r9), %r11
+; AVX512-NEXT: movq 120(%rsp,%r9), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r11, %rax
 ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 104(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %r11
+; AVX512-NEXT: movq 128(%rsp,%r9), %r15
+; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
+; AVX512-NEXT: movq %rbp, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: shldq %cl, %r14, %r15
+; AVX512-NEXT: movq 144(%rsp,%r9), %r13
+; AVX512-NEXT: movq 152(%rsp,%r9), %r12
+; AVX512-NEXT: shldq %cl, %r13, %r12
+; AVX512-NEXT: movq 96(%rsp,%r9), %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r13
 ; AVX512-NEXT: shldq %cl, %r14, %rax
 ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
-; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq 8(%rsp,%r9), %r8
+; AVX512-NEXT: movq 16(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
+; AVX512-NEXT: orq %rbp, %r13
+; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq -8(%rsp,%r9), %rax
+; AVX512-NEXT: movq (%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rdx
 ; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
-; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: shldq %cl, %r11, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
-; AVX512-NEXT: movq %r15, %r13
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: andq 384(%rdi), %r9
-; AVX512-NEXT: andq 128(%rdi), %r15
-; AVX512-NEXT: orq %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq 320(%rdi), %r11
-; AVX512-NEXT: andq 64(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: andq 448(%rdi), %r12
-; AVX512-NEXT: andq 192(%rdi), %r13
-; AVX512-NEXT: orq %r12, %r13
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: andq 288(%rdi), %r8
-; AVX512-NEXT: andq 32(%rdi), %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 416(%rdi), %rax
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: andq 160(%rdi), %r10
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: andq 352(%rdi), %rbx
-; AVX512-NEXT: orq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 96(%rdi), %rax
-; AVX512-NEXT: orq %rbx, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 480(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andq 224(%rdi), %r15
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: orq %r8, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 272(%rdi), %r8
-; AVX512-NEXT: orq %r10, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 16(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 400(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 144(%rdi), %rax
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 336(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 80(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 464(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 208(%rdi), %r11
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: orq %r8, %r11
-; AVX512-NEXT: orq %rax, %r11
-; AVX512-NEXT: orq %r9, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 304(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 48(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 432(%rdi), %r9
-; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 176(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 368(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 112(%rdi), %rax
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movq %r8, %r10
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 496(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 240(%rdi), %r9
-; AVX512-NEXT: orq %r8, %r9
-; AVX512-NEXT: orq %rax, %r9
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 392(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: andq 136(%rdi), %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 328(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 72(%rdi), %rax
-; AVX512-NEXT: orq %r10, %rbp
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 456(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: andq 200(%rdi), %r12
-; AVX512-NEXT: orq %rax, %r12
-; AVX512-NEXT: orq %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 296(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 40(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 424(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 168(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 360(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 104(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 488(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: andq 232(%rdi), %r14
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: orq %r10, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 280(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 408(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 152(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 344(%rdi), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 88(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 472(%rdi), %rax
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: andq 216(%rdi), %rbx
-; AVX512-NEXT: orq %rax, %rbx
+; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
+; AVX512-NEXT: orq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
+; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rbp
+; AVX512-NEXT: shldq %cl, %rdx, %rbp
+; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
 ; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %r10, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 312(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 56(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 440(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 184(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 376(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 120(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 504(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 248(%rdi), %r8
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
+; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
+; AVX512-NEXT: orq %rbp, %r8
+; AVX512-NEXT: shlxq %rcx, %r14, %r11
+; AVX512-NEXT: movq -32(%rsp,%r9), %r9
+; AVX512-NEXT: shldq %cl, %rsi, %rax
 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rsi, %r10
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: shlxq %rcx, %rax, %rsi
-; AVX512-NEXT: andq 256(%rdi), %r10
-; AVX512-NEXT: andq (%rdi), %rsi
-; AVX512-NEXT: orq %r10, %rsi
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %r13, %rsi
-; AVX512-NEXT: orq %r15, %rsi
+; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %r9, %rax
 ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: andnq (%rdi), %r11, %rcx
+; AVX512-NEXT: orq %rax, %rcx
 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 264(%rdi), %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: andl $60, %r10d
+; AVX512-NEXT: movl (%rdi,%r10), %edx
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX512-NEXT: btl %r9d, %edx
+; AVX512-NEXT: movq %r13,
48(%rdi) +; AVX512-NEXT: movq %r12, 56(%rdi) +; AVX512-NEXT: movq %r15, 32(%rdi) +; AVX512-NEXT: movq %rbx, 40(%rdi) +; AVX512-NEXT: movq %r8, 16(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: movq %rcx, (%rdi) +; AVX512-NEXT: movq %rax, 8(%rdi) +; AVX512-NEXT: setae %al +; AVX512-NEXT: addq $168, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 @@ -7017,6 +1644,45 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %val0 = zext i1 %value to i512 + %val = shl nuw i512 %val0, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res0 = and i512 %ld, %mask + %res = or i512 %res0, %val + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +; i4096 + +define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i4096: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $4064, %edx # imm = 0xFE0 +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -7161,8 +1827,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $64, %esp +; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -7176,51 +1842,33 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl 8(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl (%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl 12(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 40(%esp,%eax), %edx +; X86-NEXT: movl 44(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 32(%esp,%eax), %edi +; X86-NEXT: movl 36(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: shldl %cl, %edi, %ebx ; X86-NEXT: notl %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %ebx, 8(%esi) -; X86-NEXT: movl %ecx, 12(%esi) -; X86-NEXT: movl %edi, (%esi) -; X86-NEXT: movl %edx, 4(%esi) -; X86-NEXT: je .LBB22_2 +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl %ebx, 4(%eax) +; X86-NEXT: shll %cl, %edi +; X86-NEXT: notl %edi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl $96, %ebx +; X86-NEXT: shrl $3, %ebx +; X86-NEXT: movl (%eax,%ebx), %ebx +; X86-NEXT: andl %edi, (%eax) +; X86-NEXT: notl %esi +; X86-NEXT: andl %esi, 12(%eax) +; X86-NEXT: notl %edx +; X86-NEXT: andl %edx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: btl %ecx, %ebx +; X86-NEXT: jae .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: @@ -7242,52 +1890,75 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; SSE-NEXT: testb $64, %cl ; SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 ; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: jne .LBB22_2 +; SSE-NEXT: movl %ecx, %r9d +; SSE-NEXT: andl $96, %r9d +; SSE-NEXT: shrl $3, %r9d +; SSE-NEXT: movl (%rdi,%r9), %r9d +; SSE-NEXT: btl %ecx, %r9d +; SSE-NEXT: jb .LBB22_2 ; SSE-NEXT: # %bb.1: ; SSE-NEXT: movl (%rdx), %eax ; SSE-NEXT: .LBB22_2: -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: andq %rsi, (%rdi) +; SSE-NEXT: andq %r8, 8(%rdi) ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX-LABEL: reset_multiload_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: movl $1, %esi -; AVX-NEXT: xorl %r8d, %r8d -; AVX-NEXT: shldq %cl, %rsi, %r8 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: shlxq %rcx, %rsi, %r9 -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %r9, %r8 -; AVX-NEXT: cmovneq %rax, %r9 -; AVX-NEXT: movq (%rdi), %r10 -; AVX-NEXT: movq 8(%rdi), %r11 -; AVX-NEXT: andnq %r11, %r8, %rcx -; AVX-NEXT: andq %r8, %r11 -; AVX-NEXT: andnq %r10, %r9, %rsi -; AVX-NEXT: andq %r9, %r10 -; AVX-NEXT: orq %r11, %r10 -; AVX-NEXT: jne .LBB22_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl (%rdx), %eax -; AVX-NEXT: .LBB22_2: -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %rcx, 8(%rdi) -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq +; AVX2-LABEL: reset_multiload_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: movl $1, %r8d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r8, %rsi +; AVX2-NEXT: shlxq %rcx, %r8, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: 
cmovneq %rax, %r8
+; AVX2-NEXT:    notq %rsi
+; AVX2-NEXT:    notq %r8
+; AVX2-NEXT:    movl %ecx, %r9d
+; AVX2-NEXT:    andl $96, %r9d
+; AVX2-NEXT:    shrl $3, %r9d
+; AVX2-NEXT:    movl (%rdi,%r9), %r9d
+; AVX2-NEXT:    btl %ecx, %r9d
+; AVX2-NEXT:    jb .LBB22_2
+; AVX2-NEXT:  # %bb.1:
+; AVX2-NEXT:    movl (%rdx), %eax
+; AVX2-NEXT:  .LBB22_2:
+; AVX2-NEXT:    andq %r8, (%rdi)
+; AVX2-NEXT:    andq %rsi, 8(%rdi)
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: reset_multiload_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movl %esi, %ecx
+; AVX512-NEXT:    movl $1, %r8d
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    shldq %cl, %r8, %rsi
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    shlxq %rcx, %r8, %r8
+; AVX512-NEXT:    testb $64, %cl
+; AVX512-NEXT:    cmovneq %r8, %rsi
+; AVX512-NEXT:    cmovneq %rax, %r8
+; AVX512-NEXT:    notq %rsi
+; AVX512-NEXT:    notq %r8
+; AVX512-NEXT:    movl %ecx, %r9d
+; AVX512-NEXT:    andl $96, %r9d
+; AVX512-NEXT:    shrl $3, %r9d
+; AVX512-NEXT:    movl (%rdi,%r9), %r9d
+; AVX512-NEXT:    btl %ecx, %r9d
+; AVX512-NEXT:    jb .LBB22_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    movl (%rdx), %eax
+; AVX512-NEXT:  .LBB22_2:
+; AVX512-NEXT:    andq %r8, (%rdi)
+; AVX512-NEXT:    andq %rsi, 8(%rdi)
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
  %rem = and i32 %position, 127
  %ofs = zext nneg i32 %rem to i128
  %bit = shl nuw i128 1, %ofs

From ce925820d8a3ebc082a920d6cd23a40adefa0c5b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 3 Nov 2025 16:24:20 +0000
Subject: [PATCH 028/313] [VPlan] Use operands() directly in
 VPInstruction::clone() (NFC).

There's no need to create temporary SmallVectors.

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1da070a1fb7f..cfe1f1e9d7528 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1110,9 +1110,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)

   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
-    auto *New =
-        new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name);
+    auto *New = new VPInstruction(Opcode, operands(), *this, *this,
+                                  getDebugLoc(), Name);
     if (getUnderlyingValue())
       New->setUnderlyingValue(getUnderlyingInstr());
     return New;
@@ -1226,10 +1225,9 @@ class VPInstructionWithType : public VPInstruction {
   }

   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
     auto *New =
-        new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this,
-                                  getDebugLoc(), getName());
+        new VPInstructionWithType(getOpcode(), operands(), getResultType(),
+                                  *this, getDebugLoc(), getName());
     New->setUnderlyingValue(getUnderlyingValue());
     return New;
   }

From af68efc9c49383c65b6f2bc800ea40b06b66b983 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Mon, 3 Nov 2025 17:33:20 +0100
Subject: [PATCH 029/313] Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm" (#166186)

Reverts llvm/llvm-project#152161

Reverting to fix the changed logic for the expensive checks.
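For context: `callbr` is the IR instruction Clang emits for `asm goto`, so
the support being reverted concerns inputs like the following minimal,
hypothetical C++ sketch (x86 inline asm assumed; the function and label
names are illustrative only and are not taken from the reverted tests):

  // asm goto lowers to a callbr IR instruction: the inline asm either
  // falls through or jumps to one of the listed labels.
  int has_zero(int x) {
    asm goto("testl %0, %0; jz %l[zero]" // hypothetical x86 asm
             : /* no outputs */
             : "r"(x)
             : "cc"
             : zero);
    return 0; // fallthrough successor of the callbr
  zero:
    return 1; // indirect successor of the callbr
  }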
---
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  |  89 +++---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp |  19 +-
 llvm/test/CodeGen/AMDGPU/callbr.ll            |  54 ----
 ...nify-divergent-exit-nodes-with-musttail.ll |  51 ----
 llvm/test/CodeGen/AMDGPU/infinite-loop.ll     | 257 ++----------------
 .../si-annotate-nested-control-flows.ll       | 100 +------
 .../si-unify-exit-multiple-unreachables.ll    | 161 +----------
 llvm/test/CodeGen/AMDGPU/update-phi.ll        |  39 ---
 llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ----------------
 9 files changed, 79 insertions(+), 926 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll
 delete mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 706237b906cc3..ddf9a24eb5230 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,52 +181,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
   return NewRetBlock;
 }

-static BasicBlock *
-createDummyReturnBlock(Function &F,
-                       SmallVector<BasicBlock *, 4> &ReturningBlocks) {
-  BasicBlock *DummyReturnBB =
-      BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
-  Type *RetTy = F.getReturnType();
-  Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
-  ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
-  ReturningBlocks.push_back(DummyReturnBB);
-  return DummyReturnBB;
-}
-
-/// Handle conditional branch instructions (-> 2 targets) and callbr
-/// instructions with N targets.
-static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
-                          BasicBlock *DummyReturnBB,
-                          std::vector<DominatorTree::UpdateType> &Updates) {
-  SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
-  // Create a new transition block to hold the conditional branch.
-  BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
-  Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
-  // 'Successors' become successors of TransitionBB instead of BB,
-  // and TransitionBB becomes a single successor of BB.
-  Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
-  for (BasicBlock *Successor : Successors) {
-    Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-    Updates.emplace_back(DominatorTree::Delete, BB, Successor);
-  }
-
-  // Create a branch that will always branch to the transition block and
-  // references DummyReturnBB.
-  BB->getTerminator()->eraseFromParent();
-  BranchInst::Create(TransitionBB, DummyReturnBB,
-                     ConstantInt::getTrue(F.getContext()), BB);
-  Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-}
-
 bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                             const PostDominatorTree &PDT,
                                             const UniformityInfo &UA) {
+  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
+
   if (PDT.root_size() == 0 ||
       (PDT.root_size() == 1 &&
-       !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
+       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
     return false;

   // Loop over all of the blocks in a function, tracking all of the blocks that
@@ -260,27 +222,46 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
       if (HasDivergentExitBlock)
         UnreachableBlocks.push_back(BB);
     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-      if (!DummyReturnBB)
-        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+      if (DummyReturnBB == nullptr) {
+        DummyReturnBB =
+            BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+        Type *RetTy = F.getReturnType();
+        Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+        ReturningBlocks.push_back(DummyReturnBB);
+      }

       if (BI->isUnconditional()) {
         BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
         BI->eraseFromParent(); // Delete the unconditional branch.
         // Add a new conditional branch with a dummy edge to the return block.
-        BranchInst::Create(LoopHeaderBB, DummyReturnBB,
-                           ConstantInt::getTrue(F.getContext()), BB);
+        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+      } else { // Conditional branch.
+        SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+        // Create a new transition block to hold the conditional branch.
+        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+        // 'Successors' become successors of TransitionBB instead of BB,
+        // and TransitionBB becomes a single successor of BB.
+        Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+        for (BasicBlock *Successor : Successors) {
+          Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+          Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+        }
+
+        // Create a branch that will always branch to the transition block and
+        // references DummyReturnBB.
+        BB->getTerminator()->eraseFromParent();
+        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
         Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-      } else {
-        handleNBranch(F, BB, BI, DummyReturnBB, Updates);
       }

       Changed = true;
-    } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
-      if (!DummyReturnBB)
-        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
-
-      handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
-    } else {
-      llvm_unreachable("unsupported block terminator");
     }
   }

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0a8f5ea2fdae1..5f6f66a4bc213 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,10 +558,11 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
   } else {
     // Test for successors as back edge
     BasicBlock *BB = N->getNodeAs<BasicBlock>();
-    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
-      for (BasicBlock *Succ : Term->successors())
-        if (Visited.count(Succ))
-          Loops[Succ] = BB;
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (BasicBlock *Succ : Term->successors())
+      if (Visited.count(Succ))
+        Loops[Succ] = BB;
   }
 }

@@ -593,7 +594,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {

   for (BasicBlock *P : predecessors(BB)) {
     // Ignore it if it's a branch from outside into our region entry
-    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
+    if (!ParentRegion->contains(P))
       continue;

     Region *R = RI->getRegionFor(P);
@@ -1401,17 +1402,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
 /// Run the transformation for each region found
 bool StructurizeCFG::run(Region *R, DominatorTree *DT,
                          const TargetTransformInfo *TTI) {
-  // CallBr and its corresponding direct target blocks are for now ignored by
-  // this pass. This is not a limitation for the currently intended uses cases
-  // of callbr in the AMDGPU backend.
-  // Parent and child regions are not affected by this (current) restriction.
-  // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
- if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) + if (R->isTopLevelRegion()) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); + assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll deleted file mode 100644 index 253a6ec100eae..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/callbr.ll +++ /dev/null @@ -1,54 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s - -define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { -; CHECK-LABEL: callbr_inline_asm: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.1: ; %fallthrough -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dword v[2:3], v0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target -; CHECK-NEXT: ; %indirect -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dword v[4:5], v0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %a = load i32, ptr %src, align 4 - callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] -fallthrough: - store i32 %a, ptr %dst1, align 4 - br label %ret -indirect: - store i32 %a, ptr %dst2, align 4 - br label %ret -ret: - ret void -} - -define void @callbr_self_loop(i1 %c) { -; CHECK-LABEL: callbr_self_loop: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: .LBB1_1: ; %callbr -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_branch .LBB1_1 -; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target -; CHECK-NEXT: ; %callbr.target.ret -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_setpc_b64 s[30:31] - br label %callbr -callbr: - callbr void asm "", "!i"() to label %callbr [label %ret] -ret: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 076a99ff8588f..007e3f0a6bdbc 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,7 +3,6 @@ declare void @foo(ptr) declare i1 @bar(ptr) -declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -29,31 +28,6 @@ bb.1: ret void } -define void @musttail_call_without_return_value_callbr(ptr %p) { -; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 -; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) -; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] -; CHECK: [[BB_0]]: -; CHECK-NEXT: musttail call void @foo(ptr [[P]]) -; CHECK-NEXT: ret void -; CHECK: [[BB_1:.*:]] -; CHECK-NEXT: ret void -; 
-entry: - %load = load i32, ptr %p, align 1 - callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] - -bb.0: - musttail call void @foo(ptr %p) - ret void - -bb.1: - ret void -} - define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -77,28 +51,3 @@ bb.0: bb.1: ret i1 %load } - -define i32 @musttail_call_with_return_value_callbr(ptr %p) { -; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 -; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) -; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] -; CHECK: [[BB_0]]: -; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) -; CHECK-NEXT: ret i32 [[RET]] -; CHECK: [[BB_1:.*:]] -; CHECK-NEXT: ret i32 [[LOAD]] -; -entry: - %load = load i32, ptr %p, align 1 - callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] - -bb.0: - %ret = musttail call i32 @bar32(ptr %p) - ret i32 %ret - -bb.1: - ret i32 %load -} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index df635925b87df..3e2e43faca5aa 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,60 +36,26 @@ loop: br label %loop } -define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loop_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_callbr( -; IR-NEXT: entry: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP:%.*]] [] -; IR: loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP]] [] -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -entry: - callbr void asm "", ""() to label %loop [] - -loop: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop [] -} - define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop +; SI-NEXT: .LBB1_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 -; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB1_2 +; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -115,93 +81,44 @@ return: ret void } -define amdgpu_kernel void 
@infinite_loop_ret_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loop_ret_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB3_2: ; Inline asm indirect target -; SI-NEXT: ; %UnifiedReturnBlock -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_ret_callbr( -; IR-NEXT: entry: -; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 -; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) -; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] -; IR: loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP]] [] -; IR: UnifiedReturnBlock: -; IR-NEXT: ret void -; -entry: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() - %cond = icmp eq i32 %tmp, 1 - %cond32 = zext i1 %cond to i32 - callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] - -loop: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop [] - -return: - ret void -} - define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB4_4 +; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB4_2: ; %loop2 +; SI-NEXT: .LBB2_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB4_2 +; SI-NEXT: s_cbranch_vccnz .LBB2_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB4_4: ; %Flow2 +; SI-NEXT: .LBB2_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB4_7 +; SI-NEXT: s_cbranch_vccz .LBB2_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB4_6: ; %loop1 +; SI-NEXT: .LBB2_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB4_6 -; SI-NEXT: .LBB4_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB2_6 +; SI-NEXT: .LBB2_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -227,78 +144,24 @@ loop2: br label %loop2 } -define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loops_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %loop1 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; SI-NEXT: .LBB5_2: ; Inline asm indirect target -; SI-NEXT: ; %loop2.preheader -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x378 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loops_callbr( -; IR-NEXT: entry: -; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) -; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] -; IR: loop1: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP1]] [] -; IR: loop2: -; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] -; IR: TransitionBlock1: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP2:%.*]] [] -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -entry: - callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] - -loop1: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop1 [] - -loop2: - store volatile i32 888, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop2 [] -} - define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB6_5 +; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB6_2: ; %outer_loop +; SI-NEXT: .LBB3_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB6_3 Depth 2 +; SI-NEXT: ; Child Loop BB3_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB6_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB6_2 Depth=1 +; SI-NEXT: .LBB3_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB3_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -306,13 +169,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: s_cbranch_execnz .LBB3_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB6_2 -; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -349,82 +212,4 @@ return: ret void } -define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { -; 
SI-LABEL: infinite_loop_nest_ret_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_and_b64 s[0:1], exec, 0 -; SI-NEXT: s_branch .LBB7_3 -; SI-NEXT: .LBB7_2: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccnz .LBB7_5 -; SI-NEXT: .LBB7_3: ; %outer_loop -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_mov_b64 vcc, s[0:1] -; SI-NEXT: s_cbranch_vccz .LBB7_2 -; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop -; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 -; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_branch .LBB7_2 -; SI-NEXT: .LBB7_5: ; Inline asm indirect target -; SI-NEXT: ; %UnifiedReturnBlock -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_nest_ret_callbr( -; IR-NEXT: entry: -; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 -; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) -; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] -; IR: outer_loop: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[INNER_LOOP:%.*]] [] -; IR: inner_loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 -; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) -; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] -; IR: UnifiedReturnBlock: -; IR-NEXT: ret void -; -entry: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() - %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination - %cond1_32 = zext i1 %cond1 to i32 - callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] - -outer_loop: - ; %cond2 = icmp eq i32 %tmp, 2 - ; br i1 %cond2, label %outer_loop, label %inner_loop - callbr void asm "", ""() to label %inner_loop [] - -inner_loop: ; preds = %LeafBlock, %LeafBlock1 - store volatile i32 999, ptr addrspace(1) %out, align 4 - %cond3 = icmp eq i32 %tmp, 3 - %cond3_32 = zext i1 %cond3 to i32 - callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] - -return: - ret void -} - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 01bcdad3fc220..34de1e48bfb59 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,16 +3,15 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: define void @nested_inf_loop( -; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: br label 
%[[BB1:.*]] -; OPT: [[BB1]]: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] -; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] -; OPT: [[INFLOOP]]: -; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] -; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-LABEL: @nested_inf_loop( +; OPT-NEXT: BB: +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: BB1: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] +; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] +; OPT: infloop: +; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] +; OPT: DummyReturnBlock: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -64,84 +63,3 @@ BB4: BB3: br label %BB1 } - -define void @nested_inf_loop_callbr(i32 %0, i32 %1) { -; OPT-LABEL: define void @nested_inf_loop_callbr( -; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB1:.*]] [] -; OPT: [[BB1]]: -; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) -; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] -; OPT: [[BB2:.*:]] -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB4:.*]] [] -; OPT: [[BB4]]: -; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] -; OPT: [[TRANSITIONBLOCK]]: -; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) -; OPT-NEXT: to label %[[BB3]] [label %BB4] -; OPT: [[BB3]]: -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB1]] [] -; OPT: [[DUMMYRETURNBLOCK]]: -; OPT-NEXT: ret void -; -; ISA-LABEL: nested_inf_loop_callbr: -; ISA: ; %bb.0: ; %BB -; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 -; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 -; ISA-NEXT: .LBB1_1: ; %BB1 -; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec -; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; ISA-NEXT: .LBB1_2: ; %BB3 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec -; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; ISA-NEXT: s_branch .LBB1_1 -; ISA-NEXT: .LBB1_3: ; Inline asm indirect target -; ISA-NEXT: ; %BB2 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: ; Label of block must be emitted -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_mov_b64 s[6:7], -1 -; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; ISA-NEXT: s_cbranch_execz .LBB1_5 -; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 -; ISA-NEXT: .LBB1_5: ; %loop.exit.guard -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[8:9] -; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] -; ISA-NEXT: s_mov_b64 s[6:7], 0 -; ISA-NEXT: s_cbranch_vccz .LBB1_2 -; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock -; ISA-NEXT: s_setpc_b64 s[30:31] -BB: - callbr void asm "", ""() to label %BB1 [] - -BB1: - callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] - -BB2: - callbr void asm "", ""() to label %BB4 [] - -BB4: - callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] - -BB3: - callbr void asm "", ""() to label %BB1 [] -} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll 
b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 004c27971131d..4cbe682cf9f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,33 +70,8 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 -; UNIFY-LABEL: @kernel( -; UNIFY-NEXT: entry: -; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 -; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; UNIFY: if.then: -; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 -; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] -; UNIFY: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.else: -; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 -; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] -; UNIFY: if.then3: -; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 -; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] -; UNIFY: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.end6.sink.split: -; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] -; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 -; UNIFY-NEXT: br label [[IF_END6]] -; UNIFY: if.end6: -; UNIFY-NEXT: ret void -; + + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -130,129 +105,5 @@ if.end6.sink.split: if.end6: ret void } - -define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { -; CHECK-LABEL: kernel_callbr: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 -; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dword v0, v1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target -; CHECK-NEXT: ; %UnifiedReturnBlock -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target -; CHECK-NEXT: ; %if.else -; CHECK-NEXT: ; Label 
of block must be emitted -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.5: ; %if.then3 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_branch .LBB1_2 -; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target -; CHECK-NEXT: ; %cond.false.i8 -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target -; CHECK-NEXT: ; %cond.false -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: ; divergent unreachable -; CHECK-NEXT: s_branch .LBB1_3 -; UNIFY-LABEL: @kernel_callbr( -; UNIFY-NEXT: entry: -; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 -; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) -; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] -; UNIFY: if.then: -; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 -; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) -; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] -; UNIFY: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.else: -; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 -; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) -; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] -; UNIFY: if.then3: -; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 -; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) -; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] -; UNIFY: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.end6.sink.split: -; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] -; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 -; UNIFY-NEXT: callbr void asm "", ""() -; UNIFY-NEXT: to label [[IF_END6:%.*]] [] -; UNIFY: if.end6: -; UNIFY-NEXT: ret void -; -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %cmp = icmp eq i32 %n, 256 - %cmp32 = zext i1 %cmp to i32 - callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] - -if.then: - %cmp1 = icmp eq i32 %a, 0 - %cmp1_32 = zext i1 %cmp1 to i32 - callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] - -cond.false: - call void @llvm.trap() - unreachable - -if.else: - %cmp2 = icmp ult i32 %tid, 10 - %cmp2_32 = zext i1 %cmp2 to i32 - callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] - -if.then3: - %cmp1.i7 = icmp eq i32 %a, 0 - %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 - callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] - -cond.false.i8: - call void @llvm.trap() - unreachable - -if.end6.sink.split: - %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid - store i32 %a, ptr addrspace(1) %x1, align 4 - callbr void asm "", ""() to label %if.end6 [] - -if.end6: - ret void -} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; UNIFY: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 684dc1a1f0092..50666bee325e8 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,42 +37,3 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } - -define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { -; IR-LABEL: @_amdgpu_ps_main_callbr( -; IR-NEXT: .entry: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] -; IR: .loopexit: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[N28:%.*]] [] -; IR: n28: -; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] -; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 -; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 -; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) -; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] -; IR: n31: -; IR-NEXT: ret void -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -.entry: - callbr void asm "", ""() to label %.loopexit [] - -.loopexit: ; preds = %n28, %.entry - callbr void asm "", ""() to label %n28 [] - -n28: ; preds = %.loopexit, %n28 - %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] - %n29 = fadd float %.01, 1.0 - %n30 = fcmp ogt float %n29, 4.000000e+00 - %n30.32 = zext i1 %n30 to i32 - callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] - -n31: ; preds = - ret void -} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll deleted file mode 100644 index 42f95194980d4..0000000000000 --- a/llvm/test/Transforms/StructurizeCFG/callbr.ll +++ /dev/null @@ -1,235 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s - -; Structurize as usual, but don't tear callbr and its destination blocks apart. -; -; Note: currently, callbr blocks and their corresponding target blocks -; themselves are not handled by the structurizer.* If the CFG turns out to be -; unstructured at the end, the CFG lowering (si-annotate-control-flow) will -; detect this. For the currently intended use cases of callbr in the context of -; the AMDGPU backend, this is not a limitation (cf. -; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). -; -; Note 2: while callbr and its targets remain untouched, everything else is -; handled as usual, even if it is nested in a callbr region. -; -; *FIXME: this will be fixed in the future. Callbr can be handled as follows: -; Input IR: -; ``` -; define void @foo_callbr() { -; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] -; fallthrough: -; br label %exit -; indirect: -; br label %exit -; ... -; exit: -; ret void -; } -; ``` -; -; Output IR: -; ``` -; define void @foo_callbr() { -; callbr void asm "", "!i"() -; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] -; fake.indirect: ; preds = %0 -; br label %Flow -; fake.indirect1: ; preds = %0 -; br label %Flow -; fake.indirect2: ; preds = %0 -; br label %Flow -; ... 
-; Flow: ; preds = %fallthrough, %fake.indirect[0-N] -; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] -; br i1 %1, label %indirect, label %Flow1 -; Flow1: ; preds = %Flow, %indirect -; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] -; br i1 %2, label %indirect1, label %Flow2 -; Flow2: ; preds = %Flow, %indirect1 -; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] -; br i1 %2, label %indirect2, label %Flow3 -; ... -; fallthrough: ; preds = %0 -; br label %Flow -; indirect: ; preds = %Flow -; br label %Flow1 -; indirect1: ; preds = %Flow1 -; br label %Flow2 -; indirect2: : preds = %Flow2 -; br label %Flow3 -; ... -; exit: ; preds = %indirectN, %FlowN -; ret void -; } -; ``` -; -; Output IR as ASCII-art: -; %0 -; --------------------- -; | | | | -; v v v v -; f f.i f.i1 f.i2 -; | | | | -; v v v v -; --------------------- -; %Flow -; | \ -; | %indirect -; | / -; %Flow1 -; | \ -; | %indirect1 -; | / -; %Flow2 -; | \ -; | %indirect2 -; | / -; %exit -; - -; Only callbr, nothing to do. -define void @callbr_simple() { -; CHECK-LABEL: define void @callbr_simple() { -; CHECK-NEXT: [[CALLBR:.*:]] -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br label %exit -indirect: - br label %exit -exit: - ret void -} - -; Callbr nested in non-callbr: non-callbr is transformed -define void @callbr_in_non_callbr(i1 %c) { -; CHECK-LABEL: define void @callbr_in_non_callbr( -; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] -; CHECK: [[CALLBR]]: -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[NOCALLBR]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; - br i1 %c, label %callbr, label %nocallbr -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br label %exit -indirect: - br label %exit -nocallbr: - br label %exit -exit: - ret void -} - -; Callbr parent of non-callbr: non-callbr is transformed -define void @non_callbr_in_callbr(i1 %c) { -; CHECK-LABEL: define void @non_callbr_in_callbr( -; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] -; CHECK: [[FALLTHROUGH1]]: -; CHECK-NEXT: br label %[[FLOW1]] -; CHECK: [[FALLTHROUGH2]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[FLOW1]]: -; 
CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br i1 %c, label %fallthrough1, label %fallthrough2 -fallthrough1: - br label %exit -fallthrough2: - br label %exit -indirect: - br label %exit -exit: - ret void -} - -; Callbr surrounded by non-callbr: all three regular branches are handled -; correctly -define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { -; CHECK-LABEL: define void @callbr_nested_in_non_callbr( -; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] -; CHECK: [[FLOW3]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] -; CHECK: [[CALLBR]]: -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] -; CHECK: [[FALLTHROUGH1]]: -; CHECK-NEXT: br label %[[FLOW2]] -; CHECK: [[INDIRECT2:.*:]] -; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] -; CHECK: [[INDIRECT1]]: -; CHECK-NEXT: br label %[[FLOW1]] -; CHECK: [[NOCALLBR]]: -; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] -; CHECK: [[NOCALLBR1]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: br label %[[FLOW3]] -; CHECK: [[FLOW1]]: -; CHECK-NEXT: br label %[[RET]] -; CHECK: [[FLOW2]]: -; CHECK-NEXT: br label %[[RET]] -; CHECK: [[RET]]: -; CHECK-NEXT: ret void -; - br i1 %c, label %callbr, label %nocallbr -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br i1 %d, label %fallthrough1, label %ret -fallthrough1: - br label %ret -indirect: - br i1 %e, label %indirect1, label %ret -indirect1: - br label %ret -nocallbr: - br i1 %f, label %nocallbr1, label %ret -nocallbr1: - br label %ret -ret: - ret void -} From 25ed9231159f4b2d82b0cf0eb36db65c7599df45 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Mon, 3 Nov 2025 11:40:18 -0500 Subject: [PATCH 030/313] [ADT] Handle and document multiple matches in StringSwitch (#166177) Specify that the first match is returned and bail out early when processing multiple case values. --- llvm/include/llvm/ADT/StringSwitch.h | 31 ++++++++++++++++--------- llvm/unittests/ADT/StringSwitchTest.cpp | 17 ++++++++++++++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 5bdbb302a6d75..53ebec1eb3a54 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -41,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -213,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. 
   bool CaseImpl(StringLiteral S, T &Value) {
-    if (!Result && Str == S) {
-      Result = std::move(Value);
+    if (Result)
       return true;
-    }
-    return false;
+
+    if (Str != S)
+      return false;
+
+    Result = std::move(Value);
+    return true;
   }
 
-  // Returns true when `Str` matches the `S` argument (case-insensitive), and
-  // stores the result.
+  // Returns true when a match is found. If `Str` matches the `S` argument
+  // (case-insensitive), stores the result.
   bool CaseLowerImpl(StringLiteral S, T &Value) {
-    if (!Result && Str.equals_insensitive(S)) {
-      Result = std::move(Value);
+    if (Result)
       return true;
-    }
-    return false;
+
+    if (!Str.equals_insensitive(S))
+      return false;
+
+    Result = std::move(Value);
+    return true;
   }
 
   StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases,
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index c94feb54d0b7d..75d50f4dd1b5b 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -240,6 +240,23 @@ TEST(StringSwitchTest, CasesCopies) {
   EXPECT_EQ(NumCopies, 1u);
 }
 
+TEST(StringSwitchTest, StringSwitchMultipleMatches) {
+  auto Translate = [](StringRef S) {
+    return llvm::StringSwitch<int>(S)
+        .CaseLower("A", 0)
+        .Case("b", 1)
+        .Case("a", 2)
+        .CasesLower({"a", "b"}, 3)
+        .DefaultUnreachable();
+  };
+
+  // Check that the value of the first match is returned.
+  EXPECT_EQ(0, Translate("A"));
+  EXPECT_EQ(0, Translate("a"));
+  EXPECT_EQ(3, Translate("B"));
+  EXPECT_EQ(1, Translate("b"));
+}
+
 TEST(StringSwitchTest, DefaultUnreachable) {
   auto Translate = [](StringRef S) {
     return llvm::StringSwitch<int>(S)

From 96cd0dd335ad8b556235b249d2cdffeaf75892b8 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 3 Nov 2025 08:41:09 -0800
Subject: [PATCH 031/313] [clang] Migrate away from a soft-deprecated constructor of APInt (NFC) (#166127)

We have:

  /// Once all uses of this constructor are migrated to other constructors,
  /// consider marking this overload "= delete" to prevent calls from being
  /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);

This patch migrates away from this soft-deprecated constructor.
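For illustration, a minimal sketch of the migration pattern (the helper name
makeWide and the standalone setting are assumptions for this note, not code
from the patch):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"

  // Hypothetical helper: build a multi-word APInt from a word buffer.
  llvm::APInt makeWide(unsigned BitWidth, const uint64_t *Words,
                       unsigned NumWords) {
    // Before: the soft-deprecated (bit width, word count, pointer) overload.
    //   return llvm::APInt(BitWidth, NumWords, Words);
    // After: pass the words as an ArrayRef, which carries its own length.
    return llvm::APInt(BitWidth, llvm::ArrayRef(Words, NumWords));
  }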
---
 clang/include/clang/AST/APNumericStorage.h    | 5 ++---
 clang/include/clang/AST/AbstractBasicReader.h | 2 +-
 clang/lib/AST/ByteCode/Floating.h             | 3 ++-
 clang/lib/AST/ByteCode/IntegralAP.h           | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h
index e1948a552bf7e..04424086b98cf 100644
--- a/clang/include/clang/AST/APNumericStorage.h
+++ b/clang/include/clang/AST/APNumericStorage.h
@@ -41,9 +41,8 @@ class APNumericStorage {
   llvm::APInt getIntValue() const {
     unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
     if (NumWords > 1)
-      return llvm::APInt(BitWidth, NumWords, pVal);
-    else
-      return llvm::APInt(BitWidth, VAL);
+      return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords));
+    return llvm::APInt(BitWidth, VAL);
   }
   void setIntValue(const ASTContext &C, const llvm::APInt &Val);
 };
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
index 0d187eb49d6ca..064a342aa0684 100644
--- a/clang/include/clang/AST/AbstractBasicReader.h
+++ b/clang/include/clang/AST/AbstractBasicReader.h
@@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase<Impl> {
     llvm::SmallVector<uint64_t, 4> data;
     for (uint32_t i = 0; i != numWords; ++i)
       data.push_back(asImpl().readUInt64());
-    return llvm::APInt(bitWidth, numWords, &data[0]);
+    return llvm::APInt(bitWidth, data);
   }
 
   llvm::FixedPointSemantics readFixedPointSemantics() {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 659892e720abf..cc918dc12deb6 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -45,7 +45,8 @@ class Floating final {
     if (singleWord())
       return APFloat(getSemantics(), APInt(BitWidth, Val));
     unsigned NumWords = numWords();
-    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+    return APFloat(getSemantics(),
+                   APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)));
   }
 
 public:
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 6683db941c736..b11e6eea28e3f 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -63,7 +63,7 @@ template <bool Signed> class IntegralAP final {
     if (singleWord())
       return APInt(BitWidth, Val, Signed);
     unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-    return llvm::APInt(BitWidth, NumWords, Memory);
+    return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
   }
 
 public:

From 2458e15e5c819e90e308b28a8d96f23a3d685af1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 3 Nov 2025 08:41:17 -0800
Subject: [PATCH 032/313] [mlir] Migrate away from a soft-deprecated constructor of APInt (NFC) (#166128)

We have:

  /// Once all uses of this constructor are migrated to other constructors,
  /// consider marking this overload "= delete" to prevent calls from being
  /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);

This patch migrates away from this soft-deprecated constructor.
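As in the previous patch, a hedged sketch of the same rewrite in the shape the
hunk below uses, where an existing APInt's raw words seed a value of a
different width (truncateToWidth is an invented name for illustration):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"

  // Hypothetical helper: rebuild Value at TargetBits width, replacing the
  // deprecated APInt(bits, word count, pointer) call with the ArrayRef
  // overload, which bundles the pointer and length together.
  llvm::APInt truncateToWidth(const llvm::APInt &Value, unsigned TargetBits) {
    return llvm::APInt(
        TargetBits, llvm::ArrayRef(Value.getRawData(), Value.getNumWords()));
  }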
--- mlir/lib/AsmParser/Parser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 82bdb844480f1..74936e32bd9d9 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -407,8 +407,8 @@ Parser::parseFloatFromIntegerLiteral(std::optional<APFloat> &result, "hexadecimal float constant out of range for type"); } - APInt truncatedValue(typeSizeInBits, intValue.getNumWords(), - intValue.getRawData()); + APInt truncatedValue(typeSizeInBits, + ArrayRef(intValue.getRawData(), intValue.getNumWords())); result.emplace(semantics, truncatedValue); return success(); } From 11c2923ccc6ec7f67049e9ea151467224814dbe5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:25 -0800 Subject: [PATCH 033/313] [ADT] Use "using" instead of "typedef" (NFC) (#166129) Identified with modernize-use-using. --- llvm/include/llvm/ADT/APFloat.h | 6 +-- llvm/include/llvm/ADT/APInt.h | 2 +- llvm/include/llvm/ADT/BitVector.h | 6 +-- llvm/include/llvm/ADT/GenericSSAContext.h | 2 +- llvm/include/llvm/ADT/STLExtras.h | 4 +- llvm/include/llvm/ADT/ilist.h | 26 +++++------ llvm/include/llvm/ADT/ilist_node_options.h | 43 ++++++++++--------- llvm/unittests/ADT/APFloatTest.cpp | 2 +- llvm/unittests/ADT/BitVectorTest.cpp | 4 +- .../ADT/BreadthFirstIteratorTest.cpp | 4 +- llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp | 2 +- llvm/unittests/ADT/DenseMapTest.cpp | 23 +++++----- llvm/unittests/ADT/DenseSetTest.cpp | 14 +++--- llvm/unittests/ADT/DepthFirstIteratorTest.cpp | 6 +-- llvm/unittests/ADT/IListBaseTest.cpp | 25 +++++------ llvm/unittests/ADT/IListIteratorTest.cpp | 8 ++-- llvm/unittests/ADT/IListNodeBaseTest.cpp | 8 ++-- llvm/unittests/ADT/IListSentinelTest.cpp | 15 +++---- llvm/unittests/ADT/IntervalMapTest.cpp | 10 ++--- llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp | 6 +-- llvm/unittests/ADT/IteratorTest.cpp | 9 ++-- llvm/unittests/ADT/PointerSumTypeTest.cpp | 7 ++- llvm/unittests/ADT/PointerUnionTest.cpp | 12 +++--- llvm/unittests/ADT/PostOrderIteratorTest.cpp | 2 +- llvm/unittests/ADT/PriorityWorklistTest.cpp | 4 +- llvm/unittests/ADT/RangeAdapterTest.cpp | 30 ++++++------- llvm/unittests/ADT/SCCIteratorTest.cpp | 2 +- llvm/unittests/ADT/STLExtrasTest.cpp | 8 ++-- llvm/unittests/ADT/SimpleIListTest.cpp | 8 ++-- llvm/unittests/ADT/SmallPtrSetTest.cpp | 2 +- llvm/unittests/ADT/SmallStringTest.cpp | 2 +- llvm/unittests/ADT/SmallVectorTest.cpp | 26 +++++------ llvm/unittests/ADT/SparseMultiSetTest.cpp | 4 +- llvm/unittests/ADT/SparseSetTest.cpp | 4 +- llvm/unittests/ADT/TestGraph.h | 8 ++-- llvm/unittests/ADT/TinyPtrVectorTest.cpp | 12 +++--- 36 files changed, 175 insertions(+), 181 deletions(-) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. /// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. 
class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..26283d2437d48 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..8de8eb5b86640 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. 
template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... 
Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index fbe96bb127836..99cc38b6b422b 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10118,7 +10118,7 @@ TEST(APFloatTest, Float4E2M1FNToFloat) { } TEST(APFloatTest, AddOrSubtractSignificand) { - typedef detail::IEEEFloatUnitTestHelper Helper; + using Helper = detail::IEEEFloatUnitTestHelper; // Test cases are all combinations of: // {equal exponents, LHS larger exponent, RHS larger exponent} // {equal significands, LHS larger significand, RHS larger significand} diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 12ba0041af551..e13523b8e10c3 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -21,7 +21,7 @@ template <typename T> class BitVectorTest : public ::testing::Test { }; // Test both BitVector and SmallBitVector with the same suite of tests. 
-typedef ::testing::Types<BitVector, SmallBitVector> BitVectorTestTypes; +using BitVectorTestTypes = ::testing::Types<BitVector, SmallBitVector>; TYPED_TEST_SUITE(BitVectorTest, BitVectorTestTypes, ); TYPED_TEST(BitVectorTest, TrivialOperation) { @@ -857,7 +857,7 @@ TYPED_TEST(BitVectorTest, BinOps) { EXPECT_FALSE(B.anyCommon(A)); } -typedef std::vector<std::pair<int, int>> RangeList; +using RangeList = std::vector<std::pair<int, int>>; template <typename VecType> static inline VecType createBitVector(uint32_t Size, diff --git a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp index a737390e79d8d..571e4d27c6752 100644 --- a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { TEST(BreadthFristIteratorTest, Basic) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); @@ -46,7 +46,7 @@ TEST(BreadthFristIteratorTest, Basic) { } TEST(BreadthFristIteratorTest, Cycle) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index f543947899393..918a2e63da935 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -15,7 +15,7 @@ using namespace llvm; namespace { -typedef DAGDeltaAlgorithm::edge_ty edge_ty; +using edge_ty = DAGDeltaAlgorithm::edge_ty; class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { changeset_ty FailingSet; diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index aceb4f30d878d..273ee09fc1e28 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -129,18 +129,17 @@ typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = nullptr; // Register these types for testing. // clang-format off -typedef ::testing::Types<DenseMap<uint32_t, uint32_t>, - DenseMap<uint32_t *, uint32_t *>, - DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, - DenseMap<EnumClass, uint32_t>, - DenseMap<std::optional<uint32_t>, uint32_t>, - SmallDenseMap<uint32_t, uint32_t>, - SmallDenseMap<uint32_t *, uint32_t *>, - SmallDenseMap<CtorTester, CtorTester, 4, - CtorTesterMapInfo>, - SmallDenseMap<EnumClass, uint32_t>, - SmallDenseMap<std::optional<uint32_t>, uint32_t> - > DenseMapTestTypes; +using DenseMapTestTypes = ::testing::Types< + DenseMap<uint32_t, uint32_t>, + DenseMap<uint32_t *, uint32_t *>, + DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, + DenseMap<EnumClass, uint32_t>, + DenseMap<std::optional<uint32_t>, uint32_t>, + SmallDenseMap<uint32_t, uint32_t>, + SmallDenseMap<uint32_t *, uint32_t *>, + SmallDenseMap<CtorTester, CtorTester, 4, CtorTesterMapInfo>, + SmallDenseMap<EnumClass, uint32_t>, + SmallDenseMap<std::optional<uint32_t>, uint32_t>>; // clang-format on TYPED_TEST_SUITE(DenseMapTest, DenseMapTestTypes, ); diff --git a/llvm/unittests/ADT/DenseSetTest.cpp b/llvm/unittests/ADT/DenseSetTest.cpp index a24f99b6bb34f..a2a062b151b67 100644 --- a/llvm/unittests/ADT/DenseSetTest.cpp +++ b/llvm/unittests/ADT/DenseSetTest.cpp @@ -96,13 +96,13 @@ template <typename T> class DenseSetTest : public testing::Test { }; // Register these types for testing. 
-typedef ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, - const DenseSet<unsigned, TestDenseSetInfo>, - SmallDenseSet<unsigned, 1, TestDenseSetInfo>, - SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - SmallDenseSet<unsigned, 64, TestDenseSetInfo>> - DenseSetTestTypes; +using DenseSetTestTypes = + ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, + const DenseSet<unsigned, TestDenseSetInfo>, + SmallDenseSet<unsigned, 1, TestDenseSetInfo>, + SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + SmallDenseSet<unsigned, 64, TestDenseSetInfo>>; TYPED_TEST_SUITE(DenseSetTest, DenseSetTestTypes, ); TYPED_TEST(DenseSetTest, Constructor) { diff --git a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp index f792878004e7a..00312ca6044e6 100644 --- a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { template <typename T> struct CountedSet { - typedef typename SmallPtrSet<T, 4>::iterator iterator; + using iterator = typename SmallPtrSet<T, 4>::iterator; SmallPtrSet<T, 4> S; int InsertVisited = 0; @@ -44,8 +44,8 @@ template <typename T> class df_iterator_storage<CountedSet<T>, true> { }; TEST(DepthFirstIteratorTest, ActuallyUpdateIterator) { - typedef CountedSet<Graph<3>::NodeType *> StorageT; - typedef df_iterator<Graph<3>, StorageT, true> DFIter; + using StorageT = CountedSet<Graph<3>::NodeType *>; + using DFIter = df_iterator<Graph<3>, StorageT, true>; Graph<3> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/IListBaseTest.cpp b/llvm/unittests/ADT/IListBaseTest.cpp index bd915688b190d..eeed488c28d88 100644 --- a/llvm/unittests/ADT/IListBaseTest.cpp +++ b/llvm/unittests/ADT/IListBaseTest.cpp @@ -19,13 +19,14 @@ template <typename T> class IListBaseTest : public ::testing::Test {}; class Parent; // Test variants with the same test. 
-typedef ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, ilist_base<false, Parent*>, ilist_base<true, Parent*>> - IListBaseTestTypes; +using IListBaseTestTypes = + ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, + ilist_base<false, Parent *>, ilist_base<true, Parent *>>; TYPED_TEST_SUITE(IListBaseTest, IListBaseTestTypes, ); TYPED_TEST(IListBaseTest, insertBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -51,8 +52,8 @@ TYPED_TEST(IListBaseTest, insertBeforeImpl) { } TYPED_TEST(IListBaseTest, removeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -80,8 +81,8 @@ TYPED_TEST(IListBaseTest, removeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B, C, D; @@ -106,8 +107,8 @@ TYPED_TEST(IListBaseTest, removeRangeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -126,8 +127,8 @@ TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { } TYPED_TEST(IListBaseTest, transferBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S1, S2, A, B, C, D, E; diff --git a/llvm/unittests/ADT/IListIteratorTest.cpp b/llvm/unittests/ADT/IListIteratorTest.cpp index 4e5b847b35ffe..54a4258246e9b 100644 --- a/llvm/unittests/ADT/IListIteratorTest.cpp +++ b/llvm/unittests/ADT/IListIteratorTest.cpp @@ -141,10 +141,10 @@ TEST(IListIteratorTest, ReverseConstructor) { L.insert(L.end(), B); // Save typing. - typedef simple_ilist<Node>::iterator iterator; - typedef simple_ilist<Node>::reverse_iterator reverse_iterator; - typedef simple_ilist<Node>::const_iterator const_iterator; - typedef simple_ilist<Node>::const_reverse_iterator const_reverse_iterator; + using iterator = simple_ilist<Node>::iterator; + using reverse_iterator = simple_ilist<Node>::reverse_iterator; + using const_iterator = simple_ilist<Node>::const_iterator; + using const_reverse_iterator = simple_ilist<Node>::const_reverse_iterator; // Check conversion values. 
EXPECT_EQ(L.begin(), iterator(L.rend())); diff --git a/llvm/unittests/ADT/IListNodeBaseTest.cpp b/llvm/unittests/ADT/IListNodeBaseTest.cpp index ef90c716a4118..393f83af99b76 100644 --- a/llvm/unittests/ADT/IListNodeBaseTest.cpp +++ b/llvm/unittests/ADT/IListNodeBaseTest.cpp @@ -17,10 +17,10 @@ namespace { class Parent {}; -typedef ilist_node_base<false, void> RawNode; -typedef ilist_node_base<true, void> TrackingNode; -typedef ilist_node_base<false, Parent> ParentNode; -typedef ilist_node_base<true, Parent> ParentTrackingNode; +using RawNode = ilist_node_base<false, void>; +using TrackingNode = ilist_node_base<true, void>; +using ParentNode = ilist_node_base<false, Parent>; +using ParentTrackingNode = ilist_node_base<true, Parent>; TEST(IListNodeBaseTest, DefaultConstructor) { RawNode A; diff --git a/llvm/unittests/ADT/IListSentinelTest.cpp b/llvm/unittests/ADT/IListSentinelTest.cpp index 1f4a8311370a6..709a1a4bb90e7 100644 --- a/llvm/unittests/ADT/IListSentinelTest.cpp +++ b/llvm/unittests/ADT/IListSentinelTest.cpp @@ -14,18 +14,17 @@ using namespace llvm; namespace { template <class T, class... Options> struct PickSentinel { - typedef ilist_sentinel< - typename ilist_detail::compute_node_options<T, Options...>::type> - type; + using type = ilist_sentinel< + typename ilist_detail::compute_node_options<T, Options...>::type>; }; class Node : public ilist_node<Node> {}; class TrackingNode : public ilist_node<Node, ilist_sentinel_tracking<true>> {}; -typedef PickSentinel<Node>::type Sentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<true>>::type - TrackingSentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<false>>::type - NoTrackingSentinel; +using Sentinel = PickSentinel<Node>::type; +using TrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<true>>::type; +using NoTrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<false>>::type; struct LocalAccess : ilist_detail::NodeAccess { using NodeAccess::getPrev; diff --git a/llvm/unittests/ADT/IntervalMapTest.cpp b/llvm/unittests/ADT/IntervalMapTest.cpp index 99a93ab198d89..38f397ff2eb54 100644 --- a/llvm/unittests/ADT/IntervalMapTest.cpp +++ b/llvm/unittests/ADT/IntervalMapTest.cpp @@ -14,9 +14,9 @@ using namespace llvm; namespace { -typedef IntervalMap<unsigned, unsigned, 4> UUMap; -typedef IntervalMap<unsigned, unsigned, 4, - IntervalMapHalfOpenInfo<unsigned>> UUHalfOpenMap; +using UUMap = IntervalMap<unsigned, unsigned, 4>; +using UUHalfOpenMap = + IntervalMap<unsigned, unsigned, 4, IntervalMapHalfOpenInfo<unsigned>>; // Empty map tests TEST(IntervalMapTest, EmptyMap) { @@ -713,7 +713,7 @@ TEST(IntervalMapTest, OverlapsHalfOpen) { } TEST(IntervalMapOverlapsTest, SmallMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); @@ -757,7 +757,7 @@ TEST(IntervalMapOverlapsTest, SmallMaps) { } TEST(IntervalMapOverlapsTest, BigMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); diff --git a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp index f4f2083482804..6da42271764bc 100644 --- a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp +++ b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp @@ -25,9 +25,9 @@ struct SimpleRefCounted : Base<SimpleRefCounted<Base>> { template <typename T> struct 
IntrusiveRefCntPtrTest : testing::Test {}; -typedef ::testing::Types<SimpleRefCounted<RefCountedBase>, - SimpleRefCounted<ThreadSafeRefCountedBase>> - IntrusiveRefCntTypes; +using IntrusiveRefCntTypes = + ::testing::Types<SimpleRefCounted<RefCountedBase>, + SimpleRefCounted<ThreadSafeRefCountedBase>>; TYPED_TEST_SUITE(IntrusiveRefCntPtrTest, IntrusiveRefCntTypes, ); TYPED_TEST(IntrusiveRefCntPtrTest, RefCountedBaseCopyDoesNotLeak) { diff --git a/llvm/unittests/ADT/IteratorTest.cpp b/llvm/unittests/ADT/IteratorTest.cpp index b5d63efd8ccba..9dd8c1a84f44a 100644 --- a/llvm/unittests/ADT/IteratorTest.cpp +++ b/llvm/unittests/ADT/IteratorTest.cpp @@ -177,8 +177,8 @@ TEST(PointeeIteratorTest, Basic) { V.push_back(&arr[2]); V.push_back(&arr[3]); - typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<int *>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); @@ -218,9 +218,8 @@ TEST(PointeeIteratorTest, SmartPointer) { V.push_back(std::make_unique<int>(3)); V.push_back(std::make_unique<int>(4)); - typedef pointee_iterator< - SmallVectorImpl<std::unique_ptr<int>>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<std::unique_ptr<int>>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); diff --git a/llvm/unittests/ADT/PointerSumTypeTest.cpp b/llvm/unittests/ADT/PointerSumTypeTest.cpp index fbf59f3a2fda5..11e657ad8bd25 100644 --- a/llvm/unittests/ADT/PointerSumTypeTest.cpp +++ b/llvm/unittests/ADT/PointerSumTypeTest.cpp @@ -17,10 +17,9 @@ struct PointerSumTypeTest : public testing::Test { float f; int i1, i2; - typedef PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, - PointerSumTypeMember<Int1, int *>, - PointerSumTypeMember<Int2, int *>> - SumType; + using SumType = PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, + PointerSumTypeMember<Int1, int *>, + PointerSumTypeMember<Int2, int *>>; SumType a, b, c, n; PointerSumTypeTest() diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index acddb78960149..d8ac3aed76da2 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -12,9 +12,9 @@ using namespace llvm; namespace { -typedef PointerUnion<int *, float *> PU; -typedef PointerUnion<int *, float *, long long *> PU3; -typedef PointerUnion<int *, float *, long long *, double *> PU4; +using PU = PointerUnion<int *, float *>; +using PU3 = PointerUnion<int *, float *, long long *>; +using PU4 = PointerUnion<int *, float *, long long *, double *>; struct PointerUnionTest : public testing::Test { float f; @@ -116,9 +116,9 @@ TEST_F(PointerUnionTest, Get) { template<int I> struct alignas(8) Aligned {}; -typedef PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, - Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *> - PU8; +using PU8 = + PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, + Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *>; TEST_F(PointerUnionTest, ManyElements) { Aligned<0> a0; diff --git a/llvm/unittests/ADT/PostOrderIteratorTest.cpp b/llvm/unittests/ADT/PostOrderIteratorTest.cpp index 838481f76ed7f..e875dd63a1958 100644 --- a/llvm/unittests/ADT/PostOrderIteratorTest.cpp +++ b/llvm/unittests/ADT/PostOrderIteratorTest.cpp @@ -23,7 +23,7 @@ namespace { // Whether we're able to compile TEST(PostOrderIteratorTest, Compiles) { - typedef SmallPtrSet<void *, 4> ExtSetTy; + using ExtSetTy = 
SmallPtrSet<void *, 4>; // Tests that template specializations are kept up to date void *Null = nullptr; diff --git a/llvm/unittests/ADT/PriorityWorklistTest.cpp b/llvm/unittests/ADT/PriorityWorklistTest.cpp index f12d32ac9f496..08a47736c392e 100644 --- a/llvm/unittests/ADT/PriorityWorklistTest.cpp +++ b/llvm/unittests/ADT/PriorityWorklistTest.cpp @@ -20,8 +20,8 @@ namespace { using namespace llvm; template <typename T> class PriorityWorklistTest : public ::testing::Test {}; -typedef ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>> - TestTypes; +using TestTypes = + ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>>; TYPED_TEST_SUITE(PriorityWorklistTest, TestTypes, ); TYPED_TEST(PriorityWorklistTest, Basic) { diff --git a/llvm/unittests/ADT/RangeAdapterTest.cpp b/llvm/unittests/ADT/RangeAdapterTest.cpp index c1a8a984f233b..6849ccbc8052d 100644 --- a/llvm/unittests/ADT/RangeAdapterTest.cpp +++ b/llvm/unittests/ADT/RangeAdapterTest.cpp @@ -24,8 +24,8 @@ class ReverseOnlyVector { public: ReverseOnlyVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -41,11 +41,11 @@ class BidirectionalVector { public: BidirectionalVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; iterator begin() const; iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; reverse_iterator rbegin() const { return Vec.rbegin(); } reverse_iterator rend() const { return Vec.rend(); } }; @@ -58,15 +58,15 @@ class BidirectionalVectorConsts { public: BidirectionalVectorConsts(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; - typedef std::vector<int>::const_iterator const_iterator; + using iterator = std::vector<int>::iterator; + using const_iterator = std::vector<int>::const_iterator; iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -80,7 +80,7 @@ class CustomIteratorVector { public: CustomIteratorVector(std::initializer_list<int> list) : V(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; class reverse_iterator { std::vector<int>::iterator I; @@ -126,8 +126,8 @@ template <typename R> void TestRev(const R &r) { // Test fixture template <typename T> class RangeAdapterLValueTest : public ::testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, int[4]> - RangeAdapterLValueTestTypes; +using RangeAdapterLValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, 
int[4]>; TYPED_TEST_SUITE(RangeAdapterLValueTest, RangeAdapterLValueTestTypes, ); TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { @@ -140,10 +140,10 @@ TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { template <typename T> struct RangeAdapterRValueTest : testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, - ReverseOnlyVector, BidirectionalVector, - BidirectionalVectorConsts> - RangeAdapterRValueTestTypes; +using RangeAdapterRValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, + ReverseOnlyVector, BidirectionalVector, + BidirectionalVectorConsts>; TYPED_TEST_SUITE(RangeAdapterRValueTest, RangeAdapterRValueTestTypes, ); TYPED_TEST(RangeAdapterRValueTest, TrivialOperation) { diff --git a/llvm/unittests/ADT/SCCIteratorTest.cpp b/llvm/unittests/ADT/SCCIteratorTest.cpp index 48350959d046b..5f088294b1a2d 100644 --- a/llvm/unittests/ADT/SCCIteratorTest.cpp +++ b/llvm/unittests/ADT/SCCIteratorTest.cpp @@ -21,7 +21,7 @@ TEST(SCCIteratorTest, AllSmallGraphs) { // create graphs for which every node has a self-edge. #define NUM_NODES 4 #define NUM_GRAPHS (NUM_NODES * (NUM_NODES - 1)) - typedef Graph<NUM_NODES> GT; + using GT = Graph<NUM_NODES>; /// Enumerate all graphs using NUM_GRAPHS bits. static_assert(NUM_GRAPHS < sizeof(unsigned) * CHAR_BIT, "Too many graphs!"); diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 966b1f01e8a31..85567775e4ebd 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -60,7 +60,7 @@ TEST(STLExtrasTest, EnumerateLValue) { // Test that a simple LValue can be enumerated and gives correct results with // multiple types, including the empty container. std::vector<char> foo = {'a', 'b', 'c'}; - typedef std::pair<std::size_t, char> CharPairType; + using CharPairType = std::pair<std::size_t, char>; std::vector<CharPairType> CharResults; for (auto [index, value] : llvm::enumerate(foo)) { @@ -72,7 +72,7 @@ TEST(STLExtrasTest, EnumerateLValue) { CharPairType(2u, 'c'))); // Test a const range of a different type. - typedef std::pair<std::size_t, int> IntPairType; + using IntPairType = std::pair<std::size_t, int>; std::vector<IntPairType> IntResults; const std::vector<int> bar = {1, 2, 3}; for (auto [index, value] : llvm::enumerate(bar)) { @@ -111,7 +111,7 @@ TEST(STLExtrasTest, EnumerateModifyLValue) { TEST(STLExtrasTest, EnumerateRValueRef) { // Test that an rvalue can be enumerated. - typedef std::pair<std::size_t, int> PairType; + using PairType = std::pair<std::size_t, int>; std::vector<PairType> Results; auto Enumerator = llvm::enumerate(std::vector<int>{1, 2, 3}); @@ -138,7 +138,7 @@ TEST(STLExtrasTest, EnumerateModifyRValue) { // Test that when enumerating an rvalue, modification still works (even if // this isn't terribly useful, it at least shows that we haven't snuck an // extra const in there somewhere. 
- typedef std::pair<std::size_t, char> PairType; + using PairType = std::pair<std::size_t, char>; std::vector<PairType> Results; for (auto X : llvm::enumerate(std::vector<char>{'1', '2', '3'})) { diff --git a/llvm/unittests/ADT/SimpleIListTest.cpp b/llvm/unittests/ADT/SimpleIListTest.cpp index c2992baf8a5f7..cf3df8c293e25 100644 --- a/llvm/unittests/ADT/SimpleIListTest.cpp +++ b/llvm/unittests/ADT/SimpleIListTest.cpp @@ -605,8 +605,8 @@ struct Tag2 {}; struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, ilist_node<DoubleNode, ilist_tag<Tag2>> { - typedef ilist_node<DoubleNode, ilist_tag<Tag1>> Node1Type; - typedef ilist_node<DoubleNode, ilist_tag<Tag2>> Node2Type; + using Node1Type = ilist_node<DoubleNode, ilist_tag<Tag1>>; + using Node2Type = ilist_node<DoubleNode, ilist_tag<Tag2>>; Node1Type::self_iterator getIterator1() { return Node1Type::getIterator(); } Node2Type::self_iterator getIterator2() { return Node2Type::getIterator(); } @@ -617,8 +617,8 @@ struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, return Node2Type::getIterator(); } }; -typedef simple_ilist<DoubleNode, ilist_tag<Tag1>> TaggedList1Type; -typedef simple_ilist<DoubleNode, ilist_tag<Tag2>> TaggedList2Type; +using TaggedList1Type = simple_ilist<DoubleNode, ilist_tag<Tag1>>; +using TaggedList2Type = simple_ilist<DoubleNode, ilist_tag<Tag2>>; TEST(SimpleIListTest, TaggedLists) { TaggedList1Type L1; diff --git a/llvm/unittests/ADT/SmallPtrSetTest.cpp b/llvm/unittests/ADT/SmallPtrSetTest.cpp index a627091b90c70..fe7a8279d06b1 100644 --- a/llvm/unittests/ADT/SmallPtrSetTest.cpp +++ b/llvm/unittests/ADT/SmallPtrSetTest.cpp @@ -57,7 +57,7 @@ TEST(SmallPtrSetTest, GrowthTest) { SmallPtrSet<int *, 4> s; - typedef SmallPtrSet<int *, 4>::iterator iter; + using iter = SmallPtrSet<int *, 4>::iterator; s.insert(&buf[0]); s.insert(&buf[1]); diff --git a/llvm/unittests/ADT/SmallStringTest.cpp b/llvm/unittests/ADT/SmallStringTest.cpp index 2f4df8afeafa5..db858246c9bbf 100644 --- a/llvm/unittests/ADT/SmallStringTest.cpp +++ b/llvm/unittests/ADT/SmallStringTest.cpp @@ -23,7 +23,7 @@ namespace { // Test fixture class class SmallStringTest : public testing::Test { protected: - typedef SmallString<40> StringType; + using StringType = SmallString<40>; StringType theString; diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp index 74fc737f29335..dbc626db54482 100644 --- a/llvm/unittests/ADT/SmallVectorTest.cpp +++ b/llvm/unittests/ADT/SmallVectorTest.cpp @@ -226,13 +226,10 @@ class SmallVectorTest : public SmallVectorTestBase { VectorT otherVector; }; - -typedef ::testing::Types<SmallVector<Constructable, 0>, - SmallVector<Constructable, 1>, - SmallVector<Constructable, 2>, - SmallVector<Constructable, 4>, - SmallVector<Constructable, 5> - > SmallVectorTestTypes; +using SmallVectorTestTypes = ::testing::Types< + SmallVector<Constructable, 0>, SmallVector<Constructable, 1>, + SmallVector<Constructable, 2>, SmallVector<Constructable, 4>, + SmallVector<Constructable, 5>>; TYPED_TEST_SUITE(SmallVectorTest, SmallVectorTestTypes, ); // Constructor test. 
@@ -537,11 +534,11 @@ TYPED_TEST(SmallVectorTest, AppendNonIterTest) { } struct output_iterator { - typedef std::output_iterator_tag iterator_category; - typedef int value_type; - typedef int difference_type; - typedef value_type *pointer; - typedef value_type &reference; + using iterator_category = std::output_iterator_tag; + using value_type = int; + using difference_type = int; + using pointer = value_type *; + using reference = value_type &; operator int() { return 2; } operator Constructable() { return 7; } }; @@ -896,7 +893,7 @@ class DualSmallVectorsTest<std::pair<VectorT1, VectorT2>> : public SmallVectorTe VectorT2 otherVector; }; -typedef ::testing::Types< +using DualSmallVectorTestTypes = ::testing::Types< // Small mode -> Small mode. std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 4>>, // Small mode -> Big mode. @@ -904,8 +901,7 @@ typedef ::testing::Types< // Big mode -> Small mode. std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 4>>, // Big mode -> Big mode. - std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>> - > DualSmallVectorTestTypes; + std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>>>; TYPED_TEST_SUITE(DualSmallVectorsTest, DualSmallVectorTestTypes, ); diff --git a/llvm/unittests/ADT/SparseMultiSetTest.cpp b/llvm/unittests/ADT/SparseMultiSetTest.cpp index 54f7bc99b52fa..91d37f4684b9e 100644 --- a/llvm/unittests/ADT/SparseMultiSetTest.cpp +++ b/llvm/unittests/ADT/SparseMultiSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseMultiSet<unsigned> USet; +using USet = SparseMultiSet<unsigned>; // Empty set tests. TEST(SparseMultiSetTest, EmptySet) { @@ -211,7 +211,7 @@ struct Alt { }; TEST(SparseMultiSetTest, AltStructSet) { - typedef SparseMultiSet<Alt> ASet; + using ASet = SparseMultiSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/SparseSetTest.cpp b/llvm/unittests/ADT/SparseSetTest.cpp index 4fbf1caa247b7..f2b932907dc38 100644 --- a/llvm/unittests/ADT/SparseSetTest.cpp +++ b/llvm/unittests/ADT/SparseSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseSet<unsigned> USet; +using USet = SparseSet<unsigned>; // Empty set tests. TEST(SparseSetTest, EmptySet) { @@ -166,7 +166,7 @@ struct Alt { }; TEST(SparseSetTest, AltStructSet) { - typedef SparseSet<Alt> ASet; + using ASet = SparseSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/TestGraph.h b/llvm/unittests/ADT/TestGraph.h index a59ab504f7144..bb2ec47a0d5fe 100644 --- a/llvm/unittests/ADT/TestGraph.h +++ b/llvm/unittests/ADT/TestGraph.h @@ -34,7 +34,7 @@ class Graph { /// NodeSubset - A subset of the graph's nodes. class NodeSubset { - typedef unsigned char BitVector; // Where the limitation N <= 8 comes from. + using BitVector = unsigned char; // Where the limitation N <= 8 comes from. BitVector Elements; NodeSubset(BitVector e) : Elements(e) {} public: @@ -96,7 +96,7 @@ class Graph { }; /// NodeType - Node index and set of children of the node. - typedef std::pair<unsigned, NodeSubset> NodeType; + using NodeType = std::pair<unsigned, NodeSubset>; private: /// Nodes - The list of nodes for this graph. 
@@ -233,8 +233,8 @@ class Graph { template <unsigned N> struct GraphTraits<Graph<N> > { - typedef typename Graph<N>::NodeType *NodeRef; - typedef typename Graph<N>::ChildIterator ChildIteratorType; + using NodeRef = typename Graph<N>::NodeType *; + using ChildIteratorType = typename Graph<N>::ChildIterator; static NodeRef getEntryNode(const Graph<N> &G) { return G.AccessNode(0); } static ChildIteratorType child_begin(NodeRef Node) { diff --git a/llvm/unittests/ADT/TinyPtrVectorTest.cpp b/llvm/unittests/ADT/TinyPtrVectorTest.cpp index af4ae4f4b0db9..c77721df5055c 100644 --- a/llvm/unittests/ADT/TinyPtrVectorTest.cpp +++ b/llvm/unittests/ADT/TinyPtrVectorTest.cpp @@ -28,14 +28,14 @@ template <typename PointerTy, unsigned IntBits, typename IntType, typename PtrTraits, typename Info> struct RemovePointer< PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>> { - typedef typename RemovePointer<PointerTy>::type type; + using type = typename RemovePointer<PointerTy>::type; }; template <typename VectorT> class TinyPtrVectorTest : public testing::Test { protected: - typedef typename VectorT::value_type PtrT; - typedef typename RemovePointer<PtrT>::type ValueT; + using PtrT = typename VectorT::value_type; + using ValueT = typename RemovePointer<PtrT>::type; using PtrTraits = PointerLikeTypeTraits<PtrT>; VectorT V; @@ -78,9 +78,9 @@ class TinyPtrVectorTest : public testing::Test { } }; -typedef ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, - TinyPtrVector<PointerIntPair<int *, 1>>> - TinyPtrVectorTestTypes; +using TinyPtrVectorTestTypes = + ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, + TinyPtrVector<PointerIntPair<int *, 1>>>; TYPED_TEST_SUITE(TinyPtrVectorTest, TinyPtrVectorTestTypes, ); TYPED_TEST(TinyPtrVectorTest, EmptyTest) { From 5ed8f4847673f69dfe984a9ad653035a06e265bb Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:32 -0800 Subject: [PATCH 034/313] [Support] Use "using" instead of "typedef" (NFC) (#166130) Identified with modernize-use-using. 
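Since the mechanical hunks below can obscure how little actually changes, here
is a self-contained sketch (the names handler_t, handler_u, and Vec are
invented for illustration) showing that both spellings declare the same type,
including the function-pointer form several hunks below touch:

  #include <type_traits>
  #include <vector>

  // The two alias forms are interchangeable; only the spelling differs.
  typedef void (*handler_t)(void *user_data, const char *reason);
  using handler_u = void (*)(void *user_data, const char *reason);
  static_assert(std::is_same_v<handler_t, handler_u>);

  // 'using' additionally supports alias templates, which typedef cannot
  // express; this is one reason modernize-use-using prefers it.
  template <typename T> using Vec = std::vector<T>;
  static_assert(std::is_same_v<Vec<int>, std::vector<int>>);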
--- llvm/include/llvm/Support/Allocator.h | 2 +- llvm/include/llvm/Support/Atomic.h | 4 +-- llvm/include/llvm/Support/BinaryStreamArray.h | 8 ++--- llvm/include/llvm/Support/Chrono.h | 8 ++--- llvm/include/llvm/Support/ConvertUTF.h | 27 +++++++--------- llvm/include/llvm/Support/DebugCounter.h | 2 +- llvm/include/llvm/Support/ErrorHandling.h | 4 +-- .../llvm/Support/FormatVariadicDetails.h | 4 +-- llvm/include/llvm/Support/GenericLoopInfo.h | 16 +++++----- .../llvm/Support/GenericLoopInfoImpl.h | 6 ++-- llvm/include/llvm/Support/MD5.h | 2 +- llvm/include/llvm/Support/Mutex.h | 4 +-- llvm/include/llvm/Support/OnDiskHashTable.h | 32 +++++++++---------- .../llvm/Support/PointerLikeTypeTraits.h | 4 +-- llvm/include/llvm/Support/Program.h | 4 +-- llvm/include/llvm/Support/RISCVISAUtils.h | 4 +-- llvm/include/llvm/Support/RWMutex.h | 6 ++-- llvm/include/llvm/Support/Registry.h | 4 +-- llvm/include/llvm/Support/ScaledNumber.h | 6 ++-- llvm/include/llvm/Support/SuffixTree.h | 2 +- llvm/include/llvm/Support/Threading.h | 2 +- llvm/include/llvm/Support/TrailingObjects.h | 6 ++-- llvm/include/llvm/Support/UnicodeCharRanges.h | 2 +- llvm/include/llvm/Support/float128.h | 2 +- llvm/include/llvm/Support/thread.h | 2 +- llvm/lib/Support/BalancedPartitioning.cpp | 2 +- llvm/lib/Support/CommandLine.cpp | 8 ++--- llvm/lib/Support/DAGDeltaAlgorithm.cpp | 16 +++++----- llvm/lib/Support/DynamicLibrary.cpp | 2 +- llvm/lib/Support/Timer.cpp | 2 +- 30 files changed, 95 insertions(+), 98 deletions(-) diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. 
------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. 
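Function-pointer aliases such as fatal_error_handler_t above are where the using spelling helps readability most, since the alias name no longer sits inside the declarator. A self-contained sketch (hypothetical names, not the LLVM API):

#include <cstdio>
#include <type_traits>

// Same shape as the alias converted above; both spellings name one type.
typedef void (*HandlerOldTy)(void *UserData, const char *Reason, bool Fatal);
using HandlerTy = void (*)(void *UserData, const char *Reason, bool Fatal);
static_assert(std::is_same_v<HandlerOldTy, HandlerTy>,
              "both spellings declare the same function-pointer type");

// A stand-in installation point and a sample handler.
static HandlerTy CurrentHandler = nullptr;

static void PrintHandler(void *, const char *Reason, bool Fatal) {
  std::fprintf(stderr, "%s error: %s\n", Fatal ? "fatal" : "recoverable",
               Reason);
}

int main() {
  CurrentHandler = PrintHandler;          // install
  CurrentHandler(nullptr, "demo", true);  // invoke through the alias
  return 0;
}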
diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. 
/// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
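The OnDiskHashTable hunks re-export dependent member types from a template parameter; note that the typename keyword is still required on the right-hand side of the alias. A self-contained sketch (illustrative names, not tree code):

#include <cstdint>

// A trait-style Info class of the shape the on-disk table templates expect.
struct ExampleInfo {
  using offset_type = std::uint32_t;
};

// Re-exporting a dependent type: typename is mandatory with either spelling,
// but the using form keeps the new name up front.
template <typename Info> struct TableOld {
  typedef typename Info::offset_type offset_type;
};
template <typename Info> struct Table {
  using offset_type = typename Info::offset_type;
};

int main() {
  Table<ExampleInfo>::offset_type NumBuckets = 8; // resolves to uint32_t
  TableOld<ExampleInfo>::offset_type NumEntries = NumBuckets;
  return static_cast<int>(NumEntries) - 8; // 0 on success
}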
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if 
LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..2b5fc83d34690 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -37,7 +37,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..ecde62d8368e7 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... 
Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index de5bd795403dc..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. 
class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; From bb14b831d20a5b22da017d94b046fa40316f8364 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:41 -0800 Subject: [PATCH 035/313] [llvm] Proofread MIRLangRef.rst (#166131) --- llvm/docs/MIRLangRef.rst | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. -- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. 
It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. _limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. 
If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. Consider the example below: From 645a9ed7f26c7454b9973239a6b737ebdb42277b Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 09:03:54 -0800 Subject: [PATCH 036/313] [Github] Truncate Bug Emails when Necessary (#166081) Mailgun limits the size of an email payload to 16k. Truncate the issue body, which should be the largest part, to around 15k, and point the user to GitHub to see the rest. Fixes #165020 --- .github/workflows/llvm-bugs.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..cd3f396e7c465 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0,
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, From 4cb8f97a0965f0a1138ab9f16aa00aff9b06c312 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu <min.hsu@sifive.com> Date: Mon, 3 Nov 2025 09:04:04 -0800 Subject: [PATCH 037/313] [clang][RISCV] Add C intrinsics for XSfvfexp* and XSfvfexpa* (#165792) Add C intrinsics for XSfvfexp16e/32e, XSfvfbfexp16e, and XSfvfexpa(64e) introduced in #164349 and #164499 Co-authored-by: Jesse Huang <jesse.huang@sifive.com> Co-authored-by: Craig Topper <craig.topper@sifive.com> --- .../clang/Basic/riscv_sifive_vector.td | 27 + .../non-policy/non-overloaded/sf_vfexp_v_16.c | 131 +++++ .../non-policy/non-overloaded/sf_vfexp_v_32.c | 111 ++++ .../non-policy/non-overloaded/sf_vfexp_v_bf.c | 135 +++++ .../non-policy/non-overloaded/sf_vfexpa_v.c | 234 +++++++++ .../non-overloaded/sf_vfexpa_v_64.c | 90 ++++ .../non-policy/overloaded/sf_vfexp_v_16.c | 131 +++++ .../non-policy/overloaded/sf_vfexp_v_32.c | 111 ++++ .../non-policy/overloaded/sf_vfexp_v_bf.c | 134 +++++ .../non-policy/overloaded/sf_vfexpa_v.c | 234 +++++++++ .../non-policy/overloaded/sf_vfexpa_v_64.c | 90 ++++ .../policy/non-overloaded/sf_vfexp_v_16.c | 248 +++++++++ .../policy/non-overloaded/sf_vfexp_v_32.c | 208 ++++++++ .../policy/non-overloaded/sf_vfexp_v_bf.c | 248 +++++++++ .../policy/non-overloaded/sf_vfexpa_v.c | 448 ++++++++++++++++ .../policy/non-overloaded/sf_vfexpa_v_64.c | 167 ++++++ .../policy/overloaded/sf_vfexp_v_16.c | 261 ++++++++++ .../policy/overloaded/sf_vfexp_v_32.c | 228 ++++++++ .../policy/overloaded/sf_vfexp_v_bf.c | 272 ++++++++++ .../policy/overloaded/sf_vfexpa_v.c | 492 ++++++++++++++++++ .../policy/overloaded/sf_vfexpa_v_64.c | 183 +++++++ 21 files changed, 4183 insertions(+) create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c create mode 100644 
clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 89e644a078682..0371279aafc08 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "s", suffixes_prototypes>; } +multiclass RVVVFEXPBuiltinSet<list<list<string>> suffixes_prototypes, string type_range> { + let UnMaskedPolicyScheme = HasPassthruOperand, + OverloadedName = NAME, + Log2LMUL = [-2, -1, 0, 1, 2, 3] in + defm NAME : RVVOutBuiltinSet<NAME, type_range, suffixes_prototypes>; +} + multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> { let Log2LMUL = [-3, -2, -1, 0, 1, 2], Name = NAME, @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } +let RequiredFeatures = ["xsfvfbfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">; +} + +let RequiredFeatures = ["xsfvfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">; +} + +let RequiredFeatures = ["xsfvfexp32e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">; +} + +let RequiredFeatures = ["xsfvfexpa"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">; +} + +let RequiredFeatures = ["xsfvfexpa64e"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">; +} + let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["xsfvfwmaccqqq"] in defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..a0d5845208529 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> 
@llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..25d0991fa70cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e 
-disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return 
__riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..9fc332a1469ff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4( +// 
CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) 
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..67a9220bd011d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> 
@test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x 
i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..fd6f82db52953 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
<vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..0e769ed5fc5bc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x 
half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t 
test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..3df1eaa3a0467 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x 
float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 
x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..6179dbe8d82e4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t 
test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..1ddbb0b84520c --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, 
<vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], 
<vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..165879a8bb589 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x 
double> @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> 
[[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..aed6d87a4b18a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> 
[[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum( 
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> 
[[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> 
[[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..374f324cc0808 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, 
vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, 
vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t 
vl) { + return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..aec0b9f934ab9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> 
@test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 
x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: 
<vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> 
@test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..b6870264251cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,448 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
<vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x 
float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x 
half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> 
[[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 
[[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> 
[[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, 
vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..8638dc232cf01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,167 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S 
-passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..4ceeb7b35629c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,261 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, 
vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum( 
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 
x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..e08d6c5b371cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,228 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x 
float> @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu( +// 
CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..14570d465bea8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x 
bfloat> @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x 
bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x 
bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> 
[[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..4ac5cfc360551 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,492 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> 
@llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 
x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> 
@llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x 
float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t 
test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..d0faaee571122 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,183 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], 
<vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} From 5f0169939165c308f26585211860153aa8f0cef3 Mon Sep 17 00:00:00 2001 From: Mend Renovate <bot@renovateapp.com> Date: Mon, 3 Nov 2025 17:05:45 +0000 Subject: [PATCH 038/313] Update [Github] Update GHA Dependencies (#166111) This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [EnricoMi/publish-unit-test-result-action](https://redirect.github.com/EnricoMi/publish-unit-test-result-action) | action | minor | `v2.20.0` -> `v2.21.0` | | [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | patch | `v4.6.0` -> `v4.6.2` | | [docker/login-action](https://redirect.github.com/docker/login-action) | action | minor | `v3.5.0` -> `v3.6.0` | | [github/codeql-action](https://redirect.github.com/github/codeql-action) | action | minor | `v3.30.4` -> `v3.31.2` | | llvm/actions | action | digest | `a1ea791` -> `42d8057` | | [ossf/scorecard-action](https://redirect.github.com/ossf/scorecard-action) | action | patch | `v2.4.2` -> `v2.4.3` | | [python](https://redirect.github.com/actions/python-versions) | uses-with | minor | `3.13` -> `3.14` | --- .github/workflows/build-ci-container-tooling.yml | 2 +- .github/workflows/check-ci.yml | 2 +- .github/workflows/docs.yml | 2 +- .github/workflows/gha-codeql.yml | 4 ++-- .github/workflows/hlsl-test-all.yaml | 2 +- .github/workflows/libcxx-build-containers.yml | 2 +- .github/workflows/libcxx-run-benchmarks.yml | 2 +- .github/workflows/release-binaries.yml | 4 ++-- .github/workflows/scorecard.yml | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..992947eb2fffb 100644 --- 
a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -63,7 +63,7 @@ jobs: podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-amd64 path: "*.tar" diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..6ecad5536109b 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..7374777cb759c 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -97,7 +97,7 @@ jobs: - name: Setup Python env uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..6d490ca2c4b29 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/analyze@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..cdc951658b4d2 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: 
actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index fa73b9d9fe8d0..d1a017ab7b553 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -173,7 +173,7 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Setup Windows if: startsWith(runner.os, 'Windows') diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..bd3277a8b452c 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif From 97660c109470aeab78d9602328b7dc7dbb0f9091 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko <maks@fb.com> Date: Mon, 3 Nov 2025 09:19:33 -0800 Subject: [PATCH 039/313] [BOLT] Issue error on unclaimed PC-relative relocation (#166098) Replace assert with an error and improve the report when unclaimed PC-relative relocation is left in strict mode. --- bolt/lib/Core/BinaryContext.cpp | 18 +++++++++++------- bolt/test/X86/unclaimed-pc-rel.s | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 bolt/test/X86/unclaimed-pc-rel.s diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index a383ced1712e3..c7cd034a30410 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -778,13 +778,17 @@ void BinaryContext::populateJumpTables() { } if (opts::StrictMode && DataPCRelocations.size()) { - LLVM_DEBUG({ - dbgs() << DataPCRelocations.size() - << " unclaimed PC-relative relocations left in data:\n"; - for (uint64_t Reloc : DataPCRelocations) - dbgs() << Twine::utohexstr(Reloc) << '\n'; - }); - assert(0 && "unclaimed PC-relative relocations left in data\n"); + this->errs() << "BOLT-ERROR: " << DataPCRelocations.size() + << " unclaimed PC-relative relocation(s) left in data"; + if (opts::Verbosity) { + this->errs() << ":\n"; + for (uint64_t RelocOffset : DataPCRelocations) + this->errs() << " @0x" << Twine::utohexstr(RelocOffset) << '\n'; + } else { + this->errs() << ". Re-run with -v=1 to see the list\n"; + } + this->errs() << "BOLT-ERROR: unable to proceed with --strict\n"; + exit(1); } clearList(DataPCRelocations); } diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- /dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. 
+ +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. From a85ecfa7a8ca20bd7710ab55e88ff7a60c63a5a3 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 18:19:44 +0100 Subject: [PATCH 040/313] [clang][NFC] Fix typos in clang release notes (#166064) Fix typos in clang release notes --- clang/docs/ReleaseNotes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6959e61cac980..c3740a4a027bd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -355,7 +355,7 @@ Improvements to Clang's diagnostics potential misaligned members get processed before they can get discarded. (#GH144729) -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. (#GH105647) +- Clang now emits a diagnostic with the correct message in case of assigning to const reference captured in lambda. (#GH105647) - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of ``[[noreturn]]`` on lambdas before C++23 (#GH154493). @@ -471,7 +471,7 @@ Bug Fixes to C++ Support casts that are guaranteed to fail (#GH137518). - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190). - Fix a crash if errors "member of anonymous [...] redeclares" and - "intializing multiple members of union" coincide (#GH149985). + "initializing multiple members of union" coincide (#GH149985). - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729) - Fix the parsing of variadic member functions when the ellipis immediately follows a default argument.(#GH153445) - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be From 6a275de13f6cdf927cf3cd6125ea858ddc0c4b1d Mon Sep 17 00:00:00 2001 From: Matthew Nagy <matthew.nagy@sony.com> Date: Mon, 3 Nov 2025 17:24:55 +0000 Subject: [PATCH 041/313] =?UTF-8?q?Revert=20"[UBSan]=20Improve=20error=20m?= =?UTF-8?q?essage=20when=20a=20misalignment=20is=20due=20to=20t=E2=80=A6?= =?UTF-8?q?=20(#166197)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …arget de…" This reverts commit 47c54d55c9fac5ea7c87881e00f96e8c12b18174. 
--- clang/lib/CodeGen/CGExprCXX.cpp | 21 +++-------- clang/lib/CodeGen/CodeGenFunction.h | 5 +-- compiler-rt/lib/ubsan/ubsan_checks.inc | 1 - compiler-rt/lib/ubsan/ubsan_handlers.cpp | 33 ++++------------- .../TestCases/TypeCheck/minimum-alignment.cpp | 36 ------------------- .../ubsan/TestCases/TypeCheck/misaligned.cpp | 2 +- 6 files changed, 12 insertions(+), 86 deletions(-) delete mode 100644 compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index f2dd22e9bed3b..14d8db32bafc6 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -18,9 +18,6 @@ #include "ConstantEmitter.h" #include "TargetInfo.h" #include "clang/Basic/CodeGenOptions.h" -#include "clang/Basic/Sanitizers.h" -#include "clang/Basic/SourceLocation.h" -#include "clang/Basic/SourceManager.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "llvm/IR/Intrinsics.h" @@ -1752,17 +1749,6 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { allocator->isReservedGlobalPlacementOperator()) result = Builder.CreateLaunderInvariantGroup(result); - // Check the default alignment of the type and why. Users may incorrectly - // return misaligned memory from a replaced operator new without knowing - // about default alignment. - TypeCheckKind checkKind = CodeGenFunction::TCK_ConstructorCall; - const TargetInfo &TI = getContext().getTargetInfo(); - unsigned DefaultTargetAlignment = TI.getNewAlign() / TI.getCharWidth(); - if (SanOpts.has(SanitizerKind::Alignment) && - (DefaultTargetAlignment > - CGM.getContext().getTypeAlignInChars(allocType).getQuantity())) - checkKind = CodeGenFunction::TCK_ConstructorCallMinimumAlign; - // Emit sanitizer checks for pointer value now, so that in the case of an // array it was checked only once and not at each constructor call. We may // have already checked that the pointer is non-null. @@ -1770,9 +1756,10 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { // we'll null check the wrong pointer here. SanitizerSet SkippedChecks; SkippedChecks.set(SanitizerKind::Null, nullCheck); - EmitTypeCheck( - checkKind, E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(), - result, allocType, result.getAlignment(), SkippedChecks, numElements); + EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall, + E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(), + result, allocType, result.getAlignment(), SkippedChecks, + numElements); EmitNewInitializer(*this, E, allocType, elementTy, result, numElements, allocSizeWithoutCookie); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 047ca844c79de..8c4c1c8c2dc95 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3296,10 +3296,7 @@ class CodeGenFunction : public CodeGenTypeCache { TCK_NonnullAssign, /// Checking the operand of a dynamic_cast or a typeid expression. Must be /// null or an object within its lifetime. - TCK_DynamicOperation, - /// Checking the 'this' poiner for a constructor call, including that the - /// alignment is greater or equal to the targets minimum alignment - TCK_ConstructorCallMinimumAlign + TCK_DynamicOperation }; /// Determine whether the pointer type check \p TCK permits null pointers. 
diff --git a/compiler-rt/lib/ubsan/ubsan_checks.inc b/compiler-rt/lib/ubsan/ubsan_checks.inc index f8757d781afb8..b1d09a9024e7e 100644 --- a/compiler-rt/lib/ubsan/ubsan_checks.inc +++ b/compiler-rt/lib/ubsan/ubsan_checks.inc @@ -28,7 +28,6 @@ UBSAN_CHECK(NullptrAfterNonZeroOffset, "nullptr-after-nonzero-offset", UBSAN_CHECK(PointerOverflow, "pointer-overflow", "pointer-overflow") UBSAN_CHECK(MisalignedPointerUse, "misaligned-pointer-use", "alignment") UBSAN_CHECK(AlignmentAssumption, "alignment-assumption", "alignment") -UBSAN_CHECK(MinumumAssumedAlignment, "minimum-assumed-alignment", "alignment") UBSAN_CHECK(InsufficientObjectSize, "insufficient-object-size", "object-size") UBSAN_CHECK(SignedIntegerOverflow, "signed-integer-overflow", "signed-integer-overflow") diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp index fc6063af4562b..63319f46734a4 100644 --- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp +++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp @@ -73,26 +73,14 @@ enum TypeCheckKind { TCK_NonnullAssign, /// Checking the operand of a dynamic_cast or a typeid expression. Must be /// null or an object within its lifetime. - TCK_DynamicOperation, - /// Checking the 'this' poiner for a constructor call, including that the - /// alignment is greater or equal to the targets minimum alignment - TCK_ConstructorCallMinimumAlign + TCK_DynamicOperation }; extern const char *const TypeCheckKinds[] = { - "load of", - "store to", - "reference binding to", - "member access within", - "member call on", - "constructor call on", - "downcast of", - "downcast of", - "upcast of", - "cast to virtual base of", - "_Nonnull binding to", - "dynamic operation on", - "constructor call with pointer from operator new on"}; + "load of", "store to", "reference binding to", "member access within", + "member call on", "constructor call on", "downcast of", "downcast of", + "upcast of", "cast to virtual base of", "_Nonnull binding to", + "dynamic operation on"}; } static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, @@ -106,9 +94,7 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, ? ErrorType::NullPointerUseWithNullability : ErrorType::NullPointerUse; else if (Pointer & (Alignment - 1)) - ET = (Data->TypeCheckKind == TCK_ConstructorCallMinimumAlign) - ? 
ErrorType::MinumumAssumedAlignment - : ErrorType::MisalignedPointerUse; + ET = ErrorType::MisalignedPointerUse; else ET = ErrorType::InsufficientObjectSize; @@ -131,13 +117,6 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, Diag(Loc, DL_Error, ET, "%0 null pointer of type %1") << TypeCheckKinds[Data->TypeCheckKind] << Data->Type; break; - case ErrorType::MinumumAssumedAlignment: - Diag(Loc, DL_Error, ET, - "%0 misaligned address %1 for type %2, " - "which requires target minimum assumed %3 byte alignment") - << TypeCheckKinds[Data->TypeCheckKind] << (void *)Pointer << Data->Type - << Alignment; - break; case ErrorType::MisalignedPointerUse: Diag(Loc, DL_Error, ET, "%0 misaligned address %1 for type %3, " "which requires %2 byte alignment") diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp deleted file mode 100644 index 4642126ab74c4..0000000000000 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %clangxx %gmlt -fsanitize=alignment %s -o %t -// RUN: %run %t 2>&1 | FileCheck %s - -// UNSUPPORTED: i386 -// UNSUPPORTED: armv7l - -// These sanitizers already overload the new operator so won't compile this test -// UNSUPPORTED: ubsan-msan -// UNSUPPORTED: ubsan-tsan - -#include <cassert> -#include <cstdlib> - -void *operator new(std::size_t count) { - constexpr const size_t offset = 8; - - // allocate a bit more so we can safely offset it - void *ptr = std::malloc(count + offset); - - // verify malloc returned 16 bytes aligned mem - static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ == 16); - assert(((std::ptrdiff_t)ptr & (__STDCPP_DEFAULT_NEW_ALIGNMENT__ - 1)) == 0); - - return (char *)ptr + offset; -} - -struct Foo { - void *_cookie1, *_cookie2; -}; - -static_assert(alignof(Foo) == 8); -int main() { - // CHECK: runtime error: constructor call with pointer from operator new on misaligned address 0x{{.*}} for type 'Foo', which requires target minimum assumed 16 byte alignment - Foo *f = new Foo; - return 0; -} diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp index 4b0b2b5923c6f..e39a0ab4e6589 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp @@ -101,7 +101,7 @@ int main(int, char **argv) { return s->f() && 0; case 'n': - // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call with pointer from operator new on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires target minimum assumed 4 byte alignment + // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires 4 byte alignment // CHECK-NEW-NEXT: [[PTR]]: note: pointer points here // CHECK-NEW-NEXT: {{^ 00 00 00 01 02 03 04 05}} // CHECK-NEW-NEXT: {{^ \^}} From 513334faec2594bbeb3ac00f0092bed20b23abd3 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Mon, 3 Nov 2025 19:30:04 +0200 Subject: [PATCH 042/313] [NFC][SPIRV] Fix function type recovery (#165934) Due to limitations in GISel / IRTranslator, the SPIR-V BE replaces aggregate function args with `i32` placeholders, which are subsequently used to retrieve the original type after IR translation, from metadata. 
Due to what appears to be an oversight, the current implementation only handles a single mutation, as it does not traverse the metadata, but rather only takes the first operand. This patch addresses that limitation by correctly iterating the metadata. --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 30 +++++++++---------- .../pointers/composite-fun-fix-ptr-arg.ll | 14 +++++++++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+        if (Const->getSExtValue() == -1)
+          RetTy = CMeta->getType();
+        else
+          ArgTypes[Const->getSExtValue()] = CMeta->getType();
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
index 73c46b18bfa78..c9b2968a4aed7 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
@@ -10,6 +10,7 @@
 
 ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0
 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]]
 ; CHECK-DAG: %[[#Void:]] = OpTypeVoid
 ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]]
@@ -17,12 +18,20 @@
 ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0
 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]]
 ; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]]
+; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]]
 ; CHECK: OpFunction %[[#Void]] None %[[#FooType]]
 ; CHECK: OpFunctionParameter %[[#PtrInt8]]
 ; CHECK: OpFunctionParameter %[[#Struct]]
 ; CHECK: OpFunction %[[#Void]] None %[[#BarType]]
 ; CHECK: OpFunctionParameter %[[#PtrInt64]]
 ; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunction %[[#Void]] None %[[#BazType]]
+; CHECK: OpFunctionParameter %[[#PtrInt8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Int8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Float]]
+; CHECK: OpFunctionParameter %[[#Struct]]
 
 %t_half = type { half }
 
@@ -38,4 +47,9 @@ entry:
   ret void
 }
 
+define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) {
+entry:
+  ret void
+}
+
 declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half)

From c4763e2b9038fbd1154f0276a8b9542b8c115111 Mon Sep 17 00:00:00 2001
From: Alan Zhao <ayzhao@google.com>
Date: Mon, 3 Nov 2025 09:32:42 -0800
Subject: [PATCH 043/313] [profcheck][InstCombine] Preserve branch weights in
 logical identities (#165810)

For the simplification
```
(C && A) || (!C && B) --> sel C, A, B
```
(and related), if `C` (or `!C`) is the condition in the select
instruction representing the logical and, we can preserve that logical
and's branch weights when emitting the new instruction. Otherwise, the
profile data is unknown. If `C` is the condition of both logical ands,
then we just take the branch weights of the first logical and (though
in practice they should be equal).

Furthermore, `select-safe-transforms.ll` now passes under the profcheck
configuration, so we remove it from the failing tests.
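
For illustration, a minimal hand-written IR sketch of the first case
(it mirrors the updated tests; the exact weight values are placeholders):
```
; Before: the select implementing (C && A) carries the branch weights for C.
%not  = xor i1 %c, -1
%and1 = select i1 %c, i1 %a, i1 false, !prof !1   ; C && A
%and2 = select i1 %not, i1 %b, i1 false           ; !C && B
%or   = select i1 %and1, i1 true, i1 %and2

; After: the folded select reuses the weights of %and1 directly.
%sel  = select i1 %c, i1 %a, i1 %b, !prof !1

!1 = !{!"branch_weights", i32 2, i32 3}
```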
Tracking issue: #147390 --- .../InstCombine/InstCombineSelect.cpp | 29 +++++++++++ .../InstCombine/select-safe-transforms.ll | 51 ++++++++++--------- llvm/utils/profcheck-xfail.txt | 1 - 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = 
select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 380b162d8c58c..661c88125c9c8 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -917,7 +917,6 @@ Transforms/InstCombine/select_frexp.ll Transforms/InstCombine/select.ll Transforms/InstCombine/select-min-max.ll Transforms/InstCombine/select-of-symmetric-selects.ll -Transforms/InstCombine/select-safe-transforms.ll Transforms/InstCombine/select-select.ll Transforms/InstCombine/select-with-extreme-eq-cond.ll Transforms/InstCombine/shift.ll From e8765401d49cf0b5775391f2eaa8e44a292e2caf Mon Sep 17 00:00:00 2001 From: vangthao95 <vang.thao@amd.com> Date: Mon, 3 Nov 2025 09:36:49 -0800 Subject: [PATCH 044/313] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_FENCE (#165939) --- .../Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 ++ .../GlobalISel/memory-legalizer-atomic-fence.ll | 14 +++++++------- llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index dd474ac52c3c8..1e5885a25c195 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc 
-global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
 
 ; Note: we use MIR test checks + stop after legalizer to prevent
 ; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9f6fe81..61a61376d7ddd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
 
 declare void @readsMem(ptr) #0
 declare void @writesMem(ptr) #1

From 6747ea050dfc917b78c205102d9545902140ea2d Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu@apple.com>
Date: Mon, 3 Nov 2025 09:50:28 -0800
Subject: [PATCH 045/313] [CAS] Add UnifiedOnDiskCache and OnDiskCAS (#114103)

Add a new abstraction layer, UnifiedOnDiskCache, which adds disk-space
management and data-validation functionality on top of OnDiskGraphDB and
OnDiskKeyValueDB.

Built on top of UnifiedOnDiskCache, OnDiskCAS implements the ObjectStore
and ActionCache interfaces so that LLVM tools can interact with on-disk
CAS storage.
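
A minimal usage sketch (hypothetical client code; the path and the size
limit are placeholders, and error handling is abbreviated):
```
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
#include "llvm/CAS/ObjectStore.h"

using namespace llvm;

static Error useCAS(StringRef Path) {
  // Open (or create) the unified on-disk databases rooted at Path.
  auto DBs = cas::createOnDiskUnifiedCASDatabases(Path);
  if (!DBs)
    return DBs.takeError();
  std::unique_ptr<cas::ObjectStore> CAS = std::move(DBs->first);
  std::unique_ptr<cas::ActionCache> Cache = std::move(DBs->second);

  // Cap on-disk growth; the limit takes effect when the store is closed.
  if (Error E = CAS->setSizeLimit(uint64_t(10) << 30))
    return E;

  // ... store/load objects via *CAS and map action keys via *Cache ...
  return Error::success();
}
```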
--- llvm/include/llvm/CAS/ActionCache.h | 6 + .../llvm/CAS/BuiltinUnifiedCASDatabases.h | 59 ++ llvm/include/llvm/CAS/ObjectStore.h | 47 +- llvm/include/llvm/CAS/OnDiskGraphDB.h | 12 +- llvm/include/llvm/CAS/OnDiskKeyValueDB.h | 15 +- llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 172 +++++ llvm/lib/CAS/ActionCaches.cpp | 166 +++++ llvm/lib/CAS/BuiltinCAS.cpp | 14 +- llvm/lib/CAS/BuiltinCAS.h | 25 +- llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 38 ++ llvm/lib/CAS/CMakeLists.txt | 3 + llvm/lib/CAS/InMemoryCAS.cpp | 8 + llvm/lib/CAS/ObjectStore.cpp | 93 ++- llvm/lib/CAS/OnDiskCAS.cpp | 211 ++++++ llvm/lib/CAS/OnDiskGraphDB.cpp | 34 +- llvm/lib/CAS/OnDiskKeyValueDB.cpp | 21 +- llvm/lib/CAS/UnifiedOnDiskCache.cpp | 613 ++++++++++++++++++ llvm/unittests/CAS/ActionCacheTest.cpp | 6 +- .../CAS/BuiltinUnifiedCASDatabasesTest.cpp | 67 ++ llvm/unittests/CAS/CASTestConfig.cpp | 17 +- llvm/unittests/CAS/CASTestConfig.h | 41 +- llvm/unittests/CAS/CMakeLists.txt | 2 + llvm/unittests/CAS/ObjectStoreTest.cpp | 134 +++- llvm/unittests/CAS/OnDiskCommonUtils.h | 21 + llvm/unittests/CAS/OnDiskGraphDBTest.cpp | 12 +- llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp | 198 ++++++ 26 files changed, 1970 insertions(+), 65 deletions(-) create mode 100644 llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h create mode 100644 llvm/include/llvm/CAS/UnifiedOnDiskCache.h create mode 100644 llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp create mode 100644 llvm/lib/CAS/OnDiskCAS.cpp create mode 100644 llvm/lib/CAS/UnifiedOnDiskCache.cpp create mode 100644 llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp create mode 100644 llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. 
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. +/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. +/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. 
+  virtual Error pruneStorageData() { return Error::success(); }
+
   /// Validate the whole node tree.
   Error validateTree(ObjectRef Ref);
 
+  /// Import an object from another CAS. This will import the full tree from
+  /// the other CAS.
+  Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other);
+
+  /// Print the ObjectStore internals for debugging purposes.
+  virtual void print(raw_ostream &) const {}
+  void dump() const;
+
   /// Get CASContext
   const CASContext &getContext() const { return Context; }
@@ -290,8 +328,15 @@ class ObjectProxy {
   ObjectHandle H;
 };
 
+/// Create an in-memory CAS.
 std::unique_ptr<ObjectStore> createInMemoryCAS();
 
+/// \returns true if the \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
+bool isOnDiskCASEnabled();
+
+/// Create a persistent on-disk CAS at \p Path.
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+
 } // namespace cas
 } // namespace llvm
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
index 5f0ee0e131c0f..76cc528711b69 100644
--- a/llvm/include/llvm/CAS/OnDiskGraphDB.h
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -340,13 +340,16 @@ class OnDiskGraphDB {
   /// \param HashByteSize Size for the object digest hash bytes.
   /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
   /// if they don't exist in the primary store. The upstream store is only used
-  /// for reading nodes, new nodes are only written to the primary store.
+  /// for reading nodes, new nodes are only written to the primary store. Users
+  /// need to make sure \p UpstreamDB outlives the current instance of
+  /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache
+  /// manage both.
   /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
   /// to primary store. This is recorded at creation time and subsequent opens
   /// need to pass the same policy otherwise the \p open will fail.
   static Expected<std::unique_ptr<OnDiskGraphDB>>
   open(StringRef Path, StringRef HashName, unsigned HashByteSize,
-       std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+       OnDiskGraphDB *UpstreamDB = nullptr,
        FaultInPolicy Policy = FaultInPolicy::FullTree);
 
   ~OnDiskGraphDB();
@@ -438,8 +441,7 @@ class OnDiskGraphDB {
 
   // Private constructor.
   OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
-                OnDiskDataAllocator DataPool,
-                std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
                 FaultInPolicy Policy);
 
   /// Mapping from hash to object reference.
@@ -459,7 +461,7 @@ class OnDiskGraphDB {
   std::string RootPath;
 
   /// Optional on-disk store to be used for faulting-in nodes.
-  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  OnDiskGraphDB *UpstreamDB = nullptr;
 
   /// The policy used to fault in data from upstream.
   FaultInPolicy FIPolicy;
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
index b762518366c21..17ae52f0307fc 100644
--- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -19,6 +19,8 @@
 
 namespace llvm::cas::ondisk {
 
+class UnifiedOnDiskCache;
+
 /// An on-disk key-value data store with the following properties:
 /// * Keys are fixed length binary hashes with expected normal distribution.
 /// * Values are buffers of the same size, specified at creation time.
@@ -59,9 +61,13 @@ class OnDiskKeyValueDB {
   /// \param KeySize Size for the key hash bytes.
   /// \param ValueName Identifier name for the values.
   /// \param ValueSize Size for the value bytes.
+  /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size
+  /// and lifetime of the CAS instance; if provided, it must own the
+  /// OnDiskKeyValueDB being initialized.
   static Expected<std::unique_ptr<OnDiskKeyValueDB>>
   open(StringRef Path, StringRef HashName, unsigned KeySize,
-       StringRef ValueName, size_t ValueSize);
+       StringRef ValueName, size_t ValueSize,
+       UnifiedOnDiskCache *UnifiedCache = nullptr);
 
   using CheckValueT =
       function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>;
@@ -70,11 +76,14 @@ class OnDiskKeyValueDB {
   Error validate(CheckValueT CheckValue) const;
 
 private:
-  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache)
-      : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache,
+                   UnifiedOnDiskCache *UnifiedCache)
+      : ValueSize(ValueSize), Cache(std::move(Cache)),
+        UnifiedCache(UnifiedCache) {}
 
   const size_t ValueSize;
   OnDiskTrieRawHashMap Cache;
+  UnifiedOnDiskCache *UnifiedCache = nullptr;
 };
 
 } // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 0000000000000..6e0878a65fe72
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
+#define LLVM_CAS_UNIFIEDONDISKCACHE_H
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include <atomic>
+
+namespace llvm::cas::ondisk {
+
+class OnDiskKeyValueDB;
+
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+///
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+///
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+public:
+  /// The \p OnDiskGraphDB instance for the open directory.
+  OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+
+  /// The \p OnDiskKeyValueDB instance for the open directory.
+  OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; }
+
+  /// Open a \p UnifiedOnDiskCache instance for a directory.
+  ///
+  /// \param Path directory for the on-disk database. The directory will be
+  /// created if it doesn't exist.
+  /// \param SizeLimit Optional size for limiting growth. This takes effect
+  /// when the instance is closed.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+  /// is recorded at creation time and subsequent opens need to pass the same
+  /// policy otherwise the \p open will fail.
+  static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+  open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+       unsigned HashByteSize,
+       OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+           OnDiskGraphDB::FaultInPolicy::FullTree);
+
+  /// Validate the data in \p Path, if needed to ensure correctness.
+  ///
+  /// Note: if invalid data is detected and \p AllowRecovery is true, then
+  /// recovery requires exclusive access to the CAS and it is an error to
+  /// attempt recovery if there is concurrent use of the CAS.
+  ///
+  /// \param Path directory for the on-disk database.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param CheckHash Whether to validate that hashes match the data.
+  /// \param AllowRecovery Whether to automatically recover from invalid data by
+  /// marking the files for garbage collection.
+  /// \param ForceValidation Whether to force validation to occur even if it
+  /// should not be necessary.
+  /// \param LLVMCasBinary If provided, validation is performed out-of-process
+  /// using the given \c llvm-cas executable, which protects against crashes
+  /// during validation. Otherwise validation is performed in-process.
+  ///
+  /// \returns \c Valid if the data is already valid, \c Recovered if data
+  /// was invalid but has been cleared, \c Skipped if validation is not needed,
+  /// or an \c Error if validation cannot be performed or if the data is left
+  /// in an invalid state because \p AllowRecovery is false.
+  static Expected<ValidationResult>
+  validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
+                   bool CheckHash, bool AllowRecovery, bool ForceValidation,
+                   std::optional<StringRef> LLVMCasBinary);
+
+  /// Close the databases. This is called implicitly at destruction time, so a
+  /// client is not required to call it. After calling \p close the only method
+  /// that is valid to call is \p needsGarbageCollection.
+  ///
+  /// \param CheckSizeLimit If true, check whether the primary store has
+  /// exceeded its intended size limit. If false, the check is skipped even if a
+  /// \p SizeLimit was passed to the \p open call.
+  Error close(bool CheckSizeLimit = true);
+
+  /// Set the size for limiting growth. This takes effect when the instance is
+  /// closed.
+  void setSizeLimit(std::optional<uint64_t> SizeLimit);
+
+  /// \returns the storage size of the cache data.
+  uint64_t getStorageSize() const;
+
+  /// \returns whether the primary store has exceeded the intended size limit.
+  /// This can return false even if the overall size of the opened directory is
+  /// over the \p SizeLimit passed to \p open. To know whether garbage
+  /// collection needs to be triggered or not, call \p needsGarbageCollection.
+  bool hasExceededSizeLimit() const;
+
+  /// \returns whether there is unused data that can be deleted using a
+  /// \p collectGarbage call.
+  bool needsGarbageCollection() const { return NeedsGarbageCollection; }
+
+  /// Remove any unused data from the directory at \p Path. If there is no such
+  /// data, the operation is a no-op.
+  ///
+  /// This can be called concurrently, regardless of whether there is an open
+  /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers
+  /// in the same process or other processes.
+  ///
+  /// It is recommended that garbage-collection be triggered concurrently in the
+  /// background, so that it has minimal effect on the workload of the process.
+  static Error collectGarbage(StringRef Path);
+
+  /// Remove unused data from the current UnifiedOnDiskCache.
+  Error collectGarbage();
+
+  /// Helpers to convert between a value stored in the KeyValueDB and an
+  /// ObjectID.
+  static ObjectID getObjectIDFromValue(ArrayRef<char> Value);
+
+  using ValueBytes = std::array<char, sizeof(uint64_t)>;
+  static ValueBytes getValueFromObjectID(ObjectID ID);
+
+  ~UnifiedOnDiskCache();
+
+private:
+  friend class OnDiskGraphDB;
+  friend class OnDiskKeyValueDB;
+
+  UnifiedOnDiskCache();
+
+  Expected<std::optional<ArrayRef<char>>>
+  faultInFromUpstreamKV(ArrayRef<uint8_t> Key);
+
+  /// \returns the storage size of the primary directory.
+  uint64_t getPrimaryStorageSize() const;
+
+  std::string RootPath;
+  std::atomic<uint64_t> SizeLimit;
+
+  int LockFD = -1;
+
+  std::atomic<bool> NeedsGarbageCollection;
+  std::string PrimaryDBDir;
+
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp
index 571c5b3ca5b4b..003c850275ff4 100644
--- a/llvm/lib/CAS/ActionCaches.cpp
+++ b/llvm/lib/CAS/ActionCaches.cpp
@@ -13,7 +13,11 @@
 #include "BuiltinCAS.h"
 #include "llvm/ADT/TrieRawHashMap.h"
 #include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Errc.h"
 
 #define DEBUG_TYPE "cas-action-caches"
 
@@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache {
   Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
                                          bool CanBeDistributed) const final;
 
+  Error validate() const final {
+    return createStringError("InMemoryActionCache doesn't support validate()");
+  }
+
 private:
   using DataT = CacheEntry<sizeof(HashType)>;
   using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;
 
   InMemoryCacheT Cache;
 };
+
+/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB.
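+/// (Keys are fixed-size request hashes; each stored value is a fixed-size
+/// \p CacheEntry holding the hash of the cached result.)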
+class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
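+  // A null check-value callback means no per-value checking happens here
+  // beyond the DB's own generic record validation.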
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return 
createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..aad77dce370d8 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..3110577e03774 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include <optional>
+#include <deque>
 
 using namespace llvm;
 using namespace llvm::cas;
@@ -21,6 +21,7 @@ void CASContext::anchor() {}
 void ObjectStore::anchor() {}
 
 LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
 LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
 LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
 
@@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) {
     auto [I, Inserted] = ValidatedRefs.insert(Ref);
     if (!Inserted)
       continue; // already validated.
-    if (Error E = validate(getID(Ref)))
+    if (Error E = validateObject(getID(Ref)))
      return E;
     Expected<ObjectHandle> Obj = load(Ref);
     if (!Obj)
@@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) {
   return Error::success();
 }
 
+Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream,
+                                              ObjectRef Other) {
+  // Copy the full CAS tree from upstream in depth-first order to ensure all
+  // the child nodes are available in the downstream CAS before inserting the
+  // current object. This uses an algorithm similar to
+  // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS
+  // schema, so it can be used to import from any other ObjectStore regardless
+  // of the CAS schema.
+
+  // There is no work to do if importing from self.
+  if (this == &Upstream)
+    return Other;
+
+  /// Keeps track of the visitation state for the current node and all of its
+  /// parents. An UpstreamCursor holds information only from the upstream CAS.
+  struct UpstreamCursor {
+    ObjectRef Ref;
+    ObjectHandle Node;
+    size_t RefsCount;
+    std::deque<ObjectRef> Refs;
+  };
+  SmallVector<UpstreamCursor, 16> CursorStack;
+  /// PrimaryRefStack holds ObjectRefs in the current CAS, for nodes that were
+  /// either just stored or already exist in the current CAS.
+  SmallVector<ObjectRef, 128> PrimaryRefStack;
+  /// A map from upstream ObjectRef to current ObjectRef.
+  llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects;
+
+  auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) {
+    unsigned NumRefs = Upstream.getNumRefs(Node);
+    std::deque<ObjectRef> Refs;
+    for (unsigned I = 0; I < NumRefs; ++I)
+      Refs.push_back(Upstream.readRef(Node, I));
+
+    CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)});
+  };
+
+  auto UpstreamHandle = Upstream.load(Other);
+  if (!UpstreamHandle)
+    return UpstreamHandle.takeError();
+  enqueueNode(Other, *UpstreamHandle);
+
+  while (!CursorStack.empty()) {
+    UpstreamCursor &Cur = CursorStack.back();
+    if (Cur.Refs.empty()) {
+      // Copy the node data into the primary store.
+      // The top of \p PrimaryRefStack contains the ObjectRefs for the current
+      // node's children.
+      assert(PrimaryRefStack.size() >= Cur.RefsCount);
+      auto Refs = ArrayRef(PrimaryRefStack)
+                      .slice(PrimaryRefStack.size() - Cur.RefsCount);
+      auto NewNode = store(Refs, Upstream.getData(Cur.Node));
+      if (!NewNode)
+        return NewNode.takeError();
+
+      // Remove the current node and its IDs from the stack.
+      PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount);
+      CursorStack.pop_back();
+
+      PrimaryRefStack.push_back(*NewNode);
+      CreatedObjects.try_emplace(Cur.Ref, *NewNode);
+      continue;
+    }
+
+    // Check if the node exists already.
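+    // (An upstream node may be reachable through multiple parents;
+    // CreatedObjects memoizes the upstream-to-primary mapping so each
+    // upstream node is loaded and stored at most once.)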
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) 
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..ae9d818241f4b --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+/// directory while also restricting storage growth with a scheme of chaining
+/// the two most recent directories (primary & upstream), where the primary
+/// "faults-in" data from the upstream one. When the primary (most recent)
+/// directory exceeds its intended limit, a new empty directory becomes the
+/// primary one.
+///
+/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open
+/// receives) there are directories named like this:
+///
+/// 'v<version>.<x>'
+/// 'v<version>.<x+1>'
+/// 'v<version>.<x+2>'
+/// ...
+///
+/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and
+/// the part after the dot is an increasing integer. The primary directory is
+/// the one with the highest integer and the upstream one is the directory
+/// before it. For example, if the sub-directories contained are:
+///
+/// 'v1.5', 'v1.6', 'v1.7', 'v1.8'
+///
+/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are
+/// unused directories that can be safely deleted at any time and by any
+/// process.
+///
+/// Contained within the top-level directory is a file named "lock" which is
+/// used for processes to take shared or exclusive locks for the contents of the
+/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock
+/// for the top-level directory; when it closes, if the primary sub-directory
+/// exceeded its limit, it attempts to get an exclusive lock in order to create
+/// a new empty primary directory; if it can't get the exclusive lock it gives
+/// up and lets the next \p UnifiedOnDiskCache instance that closes attempt
+/// again.
+///
+/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a
+/// directory, by any process, the storage size in that directory will keep
+/// growing unrestricted. But the major benefit is that garbage-collection can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers in the same process or other
+/// processes.
+///
+/// The \c UnifiedOnDiskCache also provides validation and recovery on top of
+/// the underlying on-disk storage. The low-level storage is designed to remain
+/// coherent across regular process crashes, but may be invalid after power loss
+/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows
+/// validating the contents once per boot and can recover by marking invalid
+/// data for garbage collection.
+///
+/// The data recovery described above requires exclusive access to the CAS, and
+/// it is an error to attempt recovery if the CAS is open in any process/thread.
+/// In order to maximize backwards compatibility with tools that do not perform
+/// validation before opening the CAS, we do not attempt to get exclusive access
+/// until recovery is actually performed, meaning as long as the data is valid
+/// it will not conflict with concurrent use.
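+///
+/// A minimal usage sketch, for illustration only (the hash name and byte size
+/// below are assumptions; real clients take them from their CAS context, e.g.
+/// via \p createBuiltinUnifiedOnDiskCache):
+///
+/// \code
+///   auto UniDB = cantFail(UnifiedOnDiskCache::open(
+///       Path, /*SizeLimit=*/std::nullopt, "BLAKE3", /*HashByteSize=*/32));
+///   // ... use UniDB->getGraphDB() and UniDB->getKeyValueDB() ...
+///   cantFail(UniDB->close());
+///   if (UniDB->needsGarbageCollection())
+///     cantFail(UnifiedOnDiskCache::collectGarbage(Path));
+/// \endcode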
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "BuiltinCAS.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+#if __has_include(<sys/sysctl.h>)
+#include <sys/sysctl.h>
+#endif
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out
+/// how to handle the leftover sub-directories of the previous version, within
+/// the \p UnifiedOnDiskCache::collectGarbage function.
+static constexpr StringLiteral DBDirPrefix = "v1.";
+
+static constexpr StringLiteral ValidationFilename = "v1.validation";
+static constexpr StringLiteral CorruptPrefix = "corrupt.";
+
+ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) {
+  // Little-endian encoded.
+  assert(Value.size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Value.data()));
+}
+
+UnifiedOnDiskCache::ValueBytes
+UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) {
+  // Little-endian encoded.
+  UnifiedOnDiskCache::ValueBytes ValBytes;
+  static_assert(ValBytes.size() == sizeof(ID.getOpaqueData()));
+  support::endian::write64le(ValBytes.data(), ID.getOpaqueData());
+  return ValBytes;
+}
+
+Expected<std::optional<ArrayRef<char>>>
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) {
+  assert(UpstreamGraphDB);
+  assert(UpstreamKVDB);
+
+  std::optional<ArrayRef<char>> UpstreamValue;
+  if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+    return std::move(E);
+  if (!UpstreamValue)
+    return std::nullopt;
+
+  // The value is the \p ObjectID in the context of the upstream
+  // \p OnDiskGraphDB instance. Translate it to the context of the primary
+  // \p OnDiskGraphDB instance.
+  ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue);
+  auto PrimaryID =
+      PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+  if (LLVM_UNLIKELY(!PrimaryID))
+    return PrimaryID.takeError();
+  return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID));
+}
+
+/// \returns all the 'v<version>.<x>' names of sub-directories, sorted in
+/// ascending order of the integer after the dot. Corrupt directories, if
+/// included, will come first.
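+/// For example, for sub-directories {'v1.7', 'corrupt.0.v1.5', 'v1.6'} with
+/// \p IncludeCorrupt set, the result is {'corrupt.0.v1.5', 'v1.6', 'v1.7'}.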
+static Expected<SmallVector<std::string, 4>>
+getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) {
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector<DBDir> FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+      FoundDBDirs.push_back({0, std::string(SubDir)});
+      continue;
+    }
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  if (EC)
+    return createFileError(Path, EC);
+
+  // Note: the comparator must be a strict weak ordering for llvm::sort.
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    return LHS.Order < RHS.Order;
+  });
+
+  SmallVector<std::string, 4> DBDirs;
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return DBDirs;
+}
+
+static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) {
+  auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true);
+  if (!DBDirs)
+    return DBDirs.takeError();
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) {
+    StringRef Back(DBDirs->back());
+    if (Back.starts_with(CorruptPrefix))
+      break;
+    DBDirs->pop_back();
+  }
+  return *DBDirs;
+}
+
+/// Given a sub-directory named 'v<version>.<x>', writes the 'v<version>.<x+1>'
+/// name to \p OS.
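+/// For example, 'v1.7' becomes 'v1.8'.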
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status("/proc", Status))
+    return createFileError("/proc", EC);
+  return Status.getLastModificationTime().time_since_epoch().count();
+#else
+  llvm::report_fatal_error("getBootTime unimplemented");
+#endif
+}
+
+Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded(
+    StringRef RootPath, StringRef HashName, unsigned HashByteSize,
+    bool CheckHash, bool AllowRecovery, bool ForceValidation,
+    std::optional<StringRef> LLVMCasBinaryPath) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, ValidationFilename);
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(FD != -1);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+    return createFileError(PathBuf, EC);
+  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+  SmallString<8> Bytes;
+  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+    return createFileError(PathBuf, std::move(E));
+
+  uint64_t ValidationBootTime = 0;
+  if (!Bytes.empty() &&
+      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+    return createFileError(PathBuf, errc::illegal_byte_sequence,
+                           "expected integer");
+
+  static uint64_t BootTime = 0;
+  if (BootTime == 0)
+    if (Error E = getBootTime().moveInto(BootTime))
+      return std::move(E);
+
+  if (ValidationBootTime == BootTime && !ForceValidation)
+    return ValidationResult::Skipped;
+
+  // Validate!
+  bool NeedsRecovery = false;
+  if (Error E =
+          LLVMCasBinaryPath
+              ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash)
+              : validateInProcess(RootPath, HashName, HashByteSize,
+                                  CheckHash)) {
+    if (AllowRecovery) {
+      consumeError(std::move(E));
+      NeedsRecovery = true;
+    } else {
+      return std::move(E);
+    }
+  }
+
+  if (NeedsRecovery) {
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, "lock");
+
+    int LockFD = -1;
+    if (std::error_code EC = sys::fs::openFileForReadWrite(
+            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+      return createFileError(PathBuf, EC);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+      if (EC == std::errc::no_lock_available)
+        return createFileError(
+            PathBuf, EC,
+            "CAS validation requires exclusive access but CAS was in use");
+      return createFileError(PathBuf, EC);
+    }
+    auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+    auto DBDirs = getAllDBDirs(RootPath);
+    if (!DBDirs)
+      return DBDirs.takeError();
+
+    for (StringRef DBDir : *DBDirs) {
+      sys::path::remove_filename(PathBuf);
+      sys::path::append(PathBuf, DBDir);
+      std::error_code EC;
+      int Attempt = 0, MaxAttempts = 100;
+      SmallString<128> GCPath;
+      for (; Attempt < MaxAttempts; ++Attempt) {
+        GCPath.assign(RootPath);
+        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+                                      "." + DBDir);
+        EC = sys::fs::rename(PathBuf, GCPath);
+        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
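+        // Retry with the next attempt index only when the destination already
+        // exists; any other result (success or a different error) ends the
+        // loop.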
+        if (EC != errc::directory_not_empty && EC != errc::file_exists)
+          break;
+      }
+      if (Attempt == MaxAttempts)
+        return createStringError(
+            EC, "rename " + PathBuf +
+                    " failed: too many CAS directories awaiting pruning");
+      if (EC)
+        return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+                                         " failed: " + EC.message());
+    }
+  }
+
+  if (ValidationBootTime != BootTime) {
+    // Fix the filename in case we have an error to report.
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, ValidationFilename);
+    if (std::error_code EC = sys::fs::resize_file(FD, 0))
+      return createFileError(PathBuf, EC);
+    raw_fd_ostream OS(FD, /*shouldClose=*/false);
+    OS.seek(0); // resize does not reset position
+    OS << BootTime << '\n';
+    if (OS.has_error())
+      return createFileError(PathBuf, OS.error());
+  }
+
+  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+Expected<std::unique_ptr<UnifiedOnDiskCache>>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+                         StringRef HashName, unsigned HashByteSize,
+                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, "lock");
+  int LockFD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(LockFD != -1);
+  // Lock the directory with a shared lock, which prevents other processes from
+  // creating a new chain (essentially, while a \p UnifiedOnDiskCache instance
+  // holds a shared lock, the storage for the primary directory will grow
+  // unrestricted).
+  if (std::error_code EC =
+          lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared))
+    return createFileError(PathBuf, EC);
+
+  auto DBDirs = getAllDBDirs(RootPath);
+  if (!DBDirs)
+    return DBDirs.takeError();
+  if (DBDirs->empty())
+    DBDirs->push_back((Twine(DBDirPrefix) + "1").str());
+
+  assert(!DBDirs->empty());
+
+  // If there is only one directory, open the databases on it. If there are 2
+  // or more directories, get the most recent directories and chain them, with
+  // the most recent being the primary one. The remaining directories are
+  // unused data that can be garbage-collected.
+  auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  if (DBDirs->size() > 1) {
+    StringRef UpstreamDir = *(DBDirs->end() - 2);
+    PathBuf = RootPath;
+    sys::path::append(PathBuf, UpstreamDir);
+    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                      /*UpstreamDB=*/nullptr, FaultInPolicy)
+                      .moveInto(UpstreamGraphDB))
+      return std::move(E);
+    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                         /*ValueName=*/"objectid",
+                                         /*ValueSize=*/sizeof(uint64_t))
+                      .moveInto(UpstreamKVDB))
+      return std::move(E);
+  }
+
+  StringRef PrimaryDir = *(DBDirs->end() - 1);
+  PathBuf = RootPath;
+  sys::path::append(PathBuf, PrimaryDir);
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                    UpstreamGraphDB.get(), FaultInPolicy)
+                    .moveInto(PrimaryGraphDB))
+    return std::move(E);
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+  // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+  // including an extra translation step of the value during fault-in.
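+  // (Values hold ObjectIDs that are only meaningful within their own GraphDB,
+  // so a value faulted in from the upstream KVDB is re-keyed: the upstream
+  // ObjectID is resolved to its digest and re-interned in the primary GraphDB;
+  // see faultInFromUpstreamKV above.)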
+  if (Error E =
+          OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                 /*ValueName=*/"objectid",
+                                 /*ValueSize=*/sizeof(uint64_t), UniDB.get())
+              .moveInto(PrimaryKVDB))
+    return std::move(E);
+
+  UniDB->RootPath = RootPath;
+  UniDB->SizeLimit = SizeLimit.value_or(0);
+  UniDB->LockFD = LockFD;
+  UniDB->NeedsGarbageCollection = DBDirs->size() > 2;
+  UniDB->PrimaryDBDir = PrimaryDir;
+  UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB);
+  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+  UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+  UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+  return std::move(UniDB);
+}
+
+void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+  this->SizeLimit = SizeLimit.value_or(0);
+}
+
+uint64_t UnifiedOnDiskCache::getStorageSize() const {
+  uint64_t TotalSize = getPrimaryStorageSize();
+  if (UpstreamGraphDB)
+    TotalSize += UpstreamGraphDB->getStorageSize();
+  if (UpstreamKVDB)
+    TotalSize += UpstreamKVDB->getStorageSize();
+  return TotalSize;
+}
+
+uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const {
+  return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize();
+}
+
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+  uint64_t CurSizeLimit = SizeLimit;
+  if (!CurSizeLimit)
+    return false;
+
+  // If hard-limit utilization is beyond 85%, report being over the limit so
+  // that cleanup gets requested.
+  unsigned CurrentPercent =
+      std::max(PrimaryGraphDB->getHardStorageLimitUtilization(),
+               PrimaryKVDB->getHardStorageLimitUtilization());
+  if (CurrentPercent > 85)
+    return true;
+
+  // We allow each of the directories in the chain to reach up to half the
+  // intended size limit. Check whether the primary directory has exceeded half
+  // the limit or not, in order to decide whether we need to start a new chain.
+  //
+  // We could check the size limit against the sum of sizes of both the primary
+  // and upstream directories, but then if the upstream is significantly larger
+  // than the intended limit, it would trigger a new chain to be created before
+  // the primary has reached its own limit. Essentially, in such a situation we
+  // prefer reclaiming the storage later in order to get more consistent
+  // cache-hit behavior.
+  return (CurSizeLimit / 2) < getPrimaryStorageSize();
+}
+
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+  if (LockFD == -1)
+    return Error::success(); // already closed.
+  auto CloseLock = make_scope_exit([&]() {
+    assert(LockFD >= 0);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+    LockFD = -1;
+  });
+
+  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+  UpstreamKVDB.reset();
+  PrimaryKVDB.reset();
+  UpstreamGraphDB.reset();
+  PrimaryGraphDB.reset();
+  if (std::error_code EC = unlockFileThreadSafe(LockFD))
+    return createFileError(RootPath, EC);
+
+  if (!ExceededSizeLimit)
+    return Error::success();
+
+  // The primary directory exceeded its intended size limit. Try to get an
+  // exclusive lock in order to create a new primary directory for next time
+  // this \p UnifiedOnDiskCache path is opened.
+
+  if (std::error_code EC = tryLockFileThreadSafe(
+          LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) {
+    if (EC == errc::no_lock_available)
+      return Error::success(); // couldn't get exclusive lock, give up.
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp index db67e30ca203b..692da230b6e09 100644 --- a/llvm/unittests/CAS/ActionCacheTest.cpp +++ b/llvm/unittests/CAS/ActionCacheTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::cas; TEST_P(CASTest, ActionCacheHit) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID; @@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) { } TEST_P(CASTest, ActionCacheMiss) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; @@ -59,7 +59,7 @@ TEST_P(CASTest, ActionCacheMiss) { } TEST_P(CASTest, ActionCacheRewrite) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp new file mode 100644 index 0000000000000..19522e9372d85 --- /dev/null +++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) { + unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true); + + auto WithCAS = [&](llvm::function_ref<void(ObjectStore &)> Action) { + std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>> DBs; + ASSERT_THAT_ERROR( + createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs), + Succeeded()); + ObjectStore &CAS = *DBs.first; + ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded()); + Action(CAS); + }; + + std::optional<CASID> ID; + + // Create an object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref; + ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded()); + ASSERT_TRUE(Ref.has_value()); + + ID = CAS.getID(*Ref); + }); + + // Check materialization and prune the storage. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + + ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded()); + }); + + // Verify that the previous materialization check kept the object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + }); +} diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp index 10e4b689e151e..08cbf1daf727d 100644 --- a/llvm/unittests/CAS/CASTestConfig.cpp +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -8,6 +8,7 @@ #include "CASTestConfig.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include <mutex> @@ -15,7 +16,8 @@ using namespace llvm; using namespace llvm::cas; static CASTestingEnv createInMemory(int I) { - return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()}; + return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(), + std::nullopt}; } INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, @@ -23,7 +25,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, #if LLVM_ENABLE_ONDISK_CAS namespace llvm::cas::ondisk { -extern void setMaxMappingSize(uint64_t Size); +void setMaxMappingSize(uint64_t Size); } // namespace llvm::cas::ondisk void setMaxOnDiskCASMappingSize() { @@ -31,6 +33,17 @@ void setMaxOnDiskCASMappingSize() { std::call_once( Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); }); } + +static CASTestingEnv createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr<ActionCache> Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return CASTestingEnv{std::move(CAS), std::move(Cache), 
+                       std::move(Temp)};
+}
+INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
 #else
 void setMaxOnDiskCASMappingSize() {}
 #endif /* LLVM_ENABLE_ONDISK_CAS */
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index 8d3c55305f1b3..b1c0e59ff2b92 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -6,16 +6,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
+#define LLVM_UNITTESTS_CASTESTCONFIG_H
+
 #include "llvm/CAS/ActionCache.h"
 #include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
 #include "gtest/gtest.h"
+#include <memory>
 
-#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
-#define LLVM_UNITTESTS_CASTESTCONFIG_H
+namespace llvm::unittest::cas {
+class MockEnv {
+  void anchor();
+
+public:
+  virtual ~MockEnv();
+};
+} // namespace llvm::unittest::cas
 
 struct CASTestingEnv {
   std::unique_ptr<llvm::cas::ObjectStore> CAS;
   std::unique_ptr<llvm::cas::ActionCache> Cache;
+  std::optional<llvm::unittest::TempDir> Temp;
 };
 
 void setMaxOnDiskCASMappingSize();
@@ -24,26 +36,47 @@ void setMaxOnDiskCASMappingSize();
 class OnDiskCASTest : public ::testing::Test {
 protected:
   void SetUp() override {
+#if !LLVM_ENABLE_ONDISK_CAS
+    GTEST_SKIP() << "OnDiskCAS is not enabled";
+#endif
     // Use a smaller database size for testing to conserve disk space.
     setMaxOnDiskCASMappingSize();
   }
 };
 
+// Parameterized test fixture for ObjectStore and ActionCache tests.
 class CASTest
     : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
 protected:
   std::optional<int> NextCASIndex;
 
+  llvm::SmallVector<llvm::unittest::TempDir> Dirs;
+
+  llvm::SmallVector<std::unique_ptr<llvm::unittest::cas::MockEnv>> Envs;
+
   std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
     auto TD = GetParam()(++(*NextCASIndex));
+    if (TD.Temp)
+      Dirs.push_back(std::move(*TD.Temp));
     return std::move(TD.CAS);
   }
   std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
     auto TD = GetParam()(++(*NextCASIndex));
+    if (TD.Temp)
+      Dirs.push_back(std::move(*TD.Temp));
     return std::move(TD.Cache);
   }
-  void SetUp() override { NextCASIndex = 0; }
-  void TearDown() override { NextCASIndex = std::nullopt; }
+
+  void SetUp() override {
+    NextCASIndex = 0;
+    setMaxOnDiskCASMappingSize();
+  }
+
+  void TearDown() override {
+    NextCASIndex = std::nullopt;
+    Dirs.clear();
+    Envs.clear();
+  }
 };
 
 #endif
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index da469f7fccb5a..91e49be770745 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,9 +1,11 @@
 set(ONDISK_CAS_TEST_SOURCES
+  BuiltinUnifiedCASDatabasesTest.cpp
   OnDiskGraphDBTest.cpp
   OnDiskDataAllocatorTest.cpp
   OnDiskKeyValueDBTest.cpp
   OnDiskTrieRawHashMapTest.cpp
   ProgramTest.cpp
+  UnifiedOnDiskCacheTest.cpp
 )
 
 set(LLVM_OPTIONAL_SOURCES
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index 54083fdb408f6..b43ae33d74127 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -1,4 +1,4 @@
-//===- ObjectStoreTest.cpp ------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)", // Run validation on all CASIDs. for (int I = 0, E = IDs.size(); I != E; ++I) - ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded()); // Check that the blobs can be retrieved multiple times. for (int I = 0, E = IDs.size(); I != E; ++I) { @@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) { std::optional<CASID> ID2; ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String1.append(String2); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String2.append(String1); } @@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)", // Check basic printing of IDs. IDs.push_back(CAS1->getID(*Node)); - auto ID = CAS1->getID(Nodes.back()); - EXPECT_EQ(ID.toString(), IDs.back().toString()); - EXPECT_EQ(*Node, Nodes.back()); - EXPECT_EQ(ID, IDs.back()); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); if (Nodes.size() <= 1) continue; EXPECT_NE(Nodes.front(), Nodes.back()); @@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) { } for (auto ID : CreatedNodes) - ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded()); } #if LLVM_ENABLE_THREADS @@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { } TEST_P(CASTest, BlobsParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); uint64_t Size = 1ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #ifdef EXPENSIVE_CHECKS TEST_P(CASTest, BlobsBigParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); // 100k is large enough to be standalone files in our on-disk cas. uint64_t Size = 100ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #endif // EXPENSIVE_CHECKS + +#ifndef _WIN32 // create_link won't work for directories on Windows +TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) { + // This test intentionally uses symlinked paths to the same CAS to subvert the + // shared memory mappings that would normally be created within a single + // process. This breaks the lock file guarantees, so we must be careful not + // to create or destroy the CAS objects concurrently, which is when the locks + // are normally important. 
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. + uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +#endif // _WIN32 #endif // LLVM_ENABLE_THREADS + +TEST_F(OnDiskCASTest, OnDiskCASDiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + StringRef Filename = sys::path::filename(I->path()); + if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h index 89f93e08366c9..48a1830f9b219 100644 --- a/llvm/unittests/CAS/OnDiskCommonUtils.h +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -12,6 +12,8 @@ #include "llvm/CAS/BuiltinObjectHasher.h" #include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Testing/Support/Error.h" @@ -58,6 +60,25 @@ inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data, return ID; } +inline Expected<ObjectID> cachePut(OnDiskKeyValueDB &DB, ArrayRef<uint8_t> Key, + ObjectID ID) { + auto Value = UnifiedOnDiskCache::getValueFromObjectID(ID); + auto Result = DB.put(Key, Value); + if (!Result) + return Result.takeError(); + return UnifiedOnDiskCache::getObjectIDFromValue(*Result); +} + +inline Expected<std::optional<ObjectID>> cacheGet(OnDiskKeyValueDB &DB, + ArrayRef<uint8_t> Key) { + auto Result = DB.get(Key); + if (!Result) + return Result.takeError(); + if (!*Result) + return std::nullopt; + return UnifiedOnDiskCache::getObjectIDFromValue(**Result); +} + inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS, unsigned Indent = 0) { std::optional<ondisk::ObjectHandle> Obj; diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp index 3c2e96318a5ed..e9c73bfb6c8d3 100644 --- a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp +++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp @@ -102,7 +102,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInSingleNode) { std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR( OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::SingleNode) .moveInto(DB), Succeeded()); @@ -208,7 +208,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInFullTree) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::FullTree) .moveInto(DB), Succeeded()); @@ -264,14 +264,14 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy1) + sizeof(HashType), UpstreamDB.get(), + Policy1) .moveInto(DB), Succeeded()); DB.reset(); ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy2) + sizeof(HashType), UpstreamDB.get(), + Policy2) .moveInto(DB), Failed()); }; diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp new file mode 100644 index 0000000000000..09aebc2d4bc19 --- /dev/null +++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp @@ -0,0 +1,198 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "CASTestConfig.h" +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +/// Visits all the files of a directory recursively and returns the sum of their +/// sizes. +static Expected<size_t> countFileSizes(StringRef Path) { + size_t TotalSize = 0; + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() == sys::fs::file_type::directory_file) { + Expected<size_t> Subsize = countFileSizes(DirI->path()); + if (!Subsize) + return Subsize.takeError(); + TotalSize += *Subsize; + continue; + } + ErrorOr<sys::fs::basic_file_status> Stat = DirI->status(); + if (!Stat) + return createFileError(DirI->path(), Stat.getError()); + TotalSize += Stat->getSize(); + } + if (EC) + return createFileError(Path, EC); + return TotalSize; +} + +TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) { + unittest::TempDir Temp("ondisk-unified", /*Unique=*/true); + std::unique_ptr<UnifiedOnDiskCache> UniDB; + + const uint64_t SizeLimit = 1024ull * 64; + auto reopenDB = [&]() { + UniDB.reset(); + ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3", + sizeof(HashType)) + .moveInto(UniDB), + Succeeded()); + }; + + reopenDB(); + + HashType RootHash; + HashType OtherHash; + HashType Key1Hash; + HashType Key2Hash; + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID1; + ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded()); + std::optional<ObjectID> ID2; + ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded()); + std::optional<ObjectID> IDRoot; + ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + + std::optional<ObjectID> IDOther; + ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded()); + Digest = DB.getDigest(*IDOther); + ASSERT_EQ(Digest.size(), OtherHash.size()); + llvm::copy(Digest, OtherHash.data()); + + Key1Hash = digest("key1"); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR( + cachePut(UniDB->getKeyValueDB(), Key1Hash, *IDRoot).moveInto(Val), + Succeeded()); + EXPECT_EQ(IDRoot, Val); + + Key2Hash = digest("key2"); + std::optional<ObjectID> KeyID; + ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded()); + ASSERT_THAT_ERROR(cachePut(UniDB->getKeyValueDB(), + UniDB->getGraphDB().getDigest(*KeyID), *ID1) + .moveInto(Val), + Succeeded()); + } + + auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded()); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded()); + EXPECT_EQ(PrintedTree, ExpectedTree); + }; + auto checkRootTree = [&]() { + return checkTree(RootHash, "root\n 1\n 2\n"); + }; + + auto checkKey = [&](const HashType &Key, StringRef ExpectedData) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> Val; + 
ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key).moveInto(Val), + Succeeded()); + + ASSERT_TRUE(Val.has_value()); + std::optional<ondisk::ObjectHandle> Obj; + ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded()); + EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData); + }; + + checkRootTree(); + checkTree(OtherHash, "other\n"); + checkKey(Key1Hash, "root"); + checkKey(Key2Hash, "1"); + + auto storeBigObject = [&](unsigned Index) { + SmallString<1000> Buf; + Buf.append(970, 'a'); + raw_svector_ostream(Buf) << Index; + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID), + Succeeded()); + }; + + uint64_t PrevStoreSize = UniDB->getStorageSize(); + unsigned Index = 0; + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize); + UniDB->setSizeLimit(SizeLimit * 2); + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + UniDB->setSizeLimit(SizeLimit); + EXPECT_TRUE(UniDB->hasExceededSizeLimit()); + + reopenDB(); + + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + PrevStoreSize = UniDB->getStorageSize(); + ASSERT_THAT_ERROR(UniDB->close(), Succeeded()); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + reopenDB(); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + std::optional<size_t> DirSizeBefore; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore), + Succeeded()); + + ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()), + Succeeded()); + + std::optional<size_t> DirSizeAfter; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter), + Succeeded()); + EXPECT_LT(*DirSizeAfter, *DirSizeBefore); + + reopenDB(); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize); + + // 'Other' tree and 'Key2' got garbage-collected. 
+ { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded()); + EXPECT_FALSE(DB.containsObject(*ID)); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key2Hash).moveInto(Val), + Succeeded()); + EXPECT_FALSE(Val.has_value()); + } +} From 10349ca139068f6589f47369e1d48d06aeb66ad0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Mon, 3 Nov 2025 17:50:55 +0000 Subject: [PATCH 046/313] [gn build] Port 6747ea050dfc --- llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn | 3 +++ llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn | 2 ++ 2 files changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index 5590b27ac3784..1e0e918d3370c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -4,14 +4,17 @@ static_library("CAS") { "ActionCache.cpp", "ActionCaches.cpp", "BuiltinCAS.cpp", + "BuiltinUnifiedCASDatabases.cpp", "DatabaseFile.cpp", "InMemoryCAS.cpp", "MappedFileRegionArena.cpp", "ObjectStore.cpp", + "OnDiskCAS.cpp", "OnDiskCommon.cpp", "OnDiskDataAllocator.cpp", "OnDiskGraphDB.cpp", "OnDiskKeyValueDB.cpp", "OnDiskTrieRawHashMap.cpp", + "UnifiedOnDiskCache.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index 2d9eb6814c376..b10e0e6706cc3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -9,8 +9,10 @@ unittest("CASTests") { ] sources = [ "ActionCacheTest.cpp", + "BuiltinUnifiedCASDatabasesTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", + "UnifiedOnDiskCacheTest.cpp", ] if (llvm_enable_ondisk_cas) { From 2de5a17e9c8e427c75565574e8bdb5b62c35fec6 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 19:08:27 +0100 Subject: [PATCH 047/313] [CIR] Upstream FPToFP Builtin CeilOp (#166052) Upstream the FPToFP Builtin CeilOp --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 10 ++++++++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 11 +++++++++++ clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 8 ++++++++ clang/test/CIR/CodeGen/builtins-floating-point.c | 11 +++++++++-- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2b361ed0982c6..dc56db1bbd4ea 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4171,6 +4171,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> { }]; } +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> { + let summary = "Computes the ceiling of the specified value"; + let description = [{ + `cir.ceil` computes the ceiling of a given value and returns a result + of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. 
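+
+    A short usage sketch (illustrative; this is the form the updated test
+    below checks for):
+
+    ```mlir
+    %1 = cir.ceil %0 : !cir.float
+    ```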
+  }];
+}
+
 def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
   let summary = "Computes the floating-point cosine value";
   let description = [{
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index e35100ffe4b6b..d9b9e3b877b50 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -211,6 +211,17 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     assert(!cir::MissingFeatures::fastMathFlags());
     return emitUnaryMaybeConstrainedFPBuiltin<cir::CosOp>(*this, *e);
 
+  case Builtin::BIceil:
+  case Builtin::BIceilf:
+  case Builtin::BIceill:
+  case Builtin::BI__builtin_ceil:
+  case Builtin::BI__builtin_ceilf:
+  case Builtin::BI__builtin_ceilf16:
+  case Builtin::BI__builtin_ceill:
+  case Builtin::BI__builtin_ceilf128:
+    assert(!cir::MissingFeatures::fastMathFlags());
+    return emitUnaryMaybeConstrainedFPBuiltin<cir::CeilOp>(*this, *e);
+
   case Builtin::BIfabs:
   case Builtin::BIfabsf:
   case Builtin::BIfabsl:
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 5a6193fa8d840..d94108294a9a3 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1336,6 +1336,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite(
+    cir::CeilOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::FCeilOp>(op, resTy, adaptor.getSrc());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
     cir::AllocaOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c
index 193cc172d37d2..8bdc43c59dc6f 100644
--- a/clang/test/CIR/CodeGen/builtins-floating-point.c
+++ b/clang/test/CIR/CodeGen/builtins-floating-point.c
@@ -7,14 +7,21 @@
 
 float cosf(float f) {
   return __builtin_cosf(f);
-  // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float
+  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float
   // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
   // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
 }
 
 double cos(double f) {
   return __builtin_cos(f);
-  // CIR: {{.+}} = cir.cos {{.+}} : !cir.double
+  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double
   // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
   // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
 }
+
+float ceil(float f) {
+  return __builtin_ceilf(f);
+  // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float
+  // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
+  // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
+}

From d65e712e30a8998c897a6454e4eaea4f974bf765 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida@apple.com>
Date: Mon, 3 Nov 2025 10:09:18 -0800
Subject: [PATCH 048/313] [clang] Make "__GCC_HAVE_DWARF2_CFI_ASM" a proper
 predefined macro (#165731)

Use a flag to determine whether this macro should be set when
initializing the preprocessor.
This macro was added to the driver in 9d117e7b2a399a9b2bcf53fb9b9c0946e82dc75c because it can be conditionally disabled, but before that, the flag to gate behavior was removed under the assumption it wasn't conditional in b5b622a03c5136fa10d245dbe1f8f278ebd98d1b. This patch is to connect the macro with the preexisting flag --- clang-tools-extra/test/pp-trace/pp-trace-include.cpp | 1 - clang-tools-extra/test/pp-trace/pp-trace-macro.cpp | 1 - clang/include/clang/Basic/DebugOptions.def | 2 ++ clang/include/clang/Driver/Options.td | 8 ++++++-- clang/lib/Driver/ToolChains/Clang.cpp | 9 ++++++--- clang/lib/Frontend/InitPreprocessor.cpp | 3 +++ clang/test/DebugInfo/KeyInstructions/flag.cpp | 3 +++ clang/test/Preprocessor/unwind-tables.c | 2 ++ 8 files changed, 22 insertions(+), 7 deletions(-) diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: "<built-in>:1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def index a768b12fa4e0d..ea3636ffa1af1 100644 --- a/clang/include/clang/Basic/DebugOptions.def +++ b/clang/include/clang/Basic/DebugOptions.def @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2, DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm ///< is enabled. +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled. + DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain ///< inline line tables. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4778b87b789a9..5cf332783cbc3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2155,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers", PosFlag<SetTrue, [], [ClangOption], "Allow">, NegFlag<SetFalse, [], [ClangOption], "Disallow">, BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>; -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group<clang_ignored_f_Group>; -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group<clang_ignored_f_Group>; + +defm dwarf2_cfi_asm + : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse, + PosFlag<SetTrue, [], [ClangOption, CC1Option]>, + NegFlag<SetFalse>>; + defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm", CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse, NegFlag<SetTrue, [], [ClangOption, CC1Option]>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3ab6f1261ad6..30d3e5293a31b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7879,10 +7879,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, !TC.getTriple().isAndroid() && TC.useIntegratedAs())) CmdArgs.push_back("-faddrsig"); - if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && + const bool HasDefaultDwarf2CFIASM = + (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && (EH || UnwindTables || AsyncUnwindTables || - DebugInfoKind != llvm::codegenoptions::NoDebugInfo)) - CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1"); + DebugInfoKind != llvm::codegenoptions::NoDebugInfo); + if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm, + options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM)) + CmdArgs.push_back("-fdwarf2-cfi-asm"); if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) { std::string Str = A->getAsString(Args); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ed3f1f93d25d3..b88d9f89c5f71 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.PointerAuthIntrinsics) Builder.defineMacro("__PTRAUTH__"); + if (CGOpts.Dwarf2CFIAsm) + Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM"); + // Get other target #defines. TI.getTargetDefines(LangOpts, Builder); } diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp index 6aeeed664135e..4a4a5c4c142a7 100644 --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp @@ -8,6 +8,9 @@ // KEY-INSTRUCTIONS: "-gkey-instructions" // NO-KEY-INSTRUCTIONS-NOT: key-instructions + +// Only expect one dwarf related flag. 
+// NO-DEBUG: -fdwarf2-cfi-asm // NO-DEBUG-NOT: debug-info-kind // NO-DEBUG-NOT: dwarf diff --git a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c index 0a863d79adbf6..5ff990d0c40a6 100644 --- a/clang/test/Preprocessor/unwind-tables.c +++ b/clang/test/Preprocessor/unwind-tables.c @@ -1,11 +1,13 @@ // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1 From 84a9ed25e816a51df765770e8867bccbde8da8e9 Mon Sep 17 00:00:00 2001 From: Marco Maia <marcogmaia@gmail.com> Date: Mon, 3 Nov 2025 15:15:32 -0300 Subject: [PATCH 049/313] [clangd] Preserve qualified names in "override pure virtual methods" tweak (#163726) Prevents the tweak from splitting **qualified names** (e.g., `foo::Type`) by incorrectly inserting a space around the scope resolution (`::`). **Before:** ```cpp // input: virtual foo::Type::func() = 0 // output: foo :: Type :: func() ``` **After:** ```cpp // input: virtual foo::Type::func() = 0 // output: foo::Type::func() ``` --- .../refactor/tweaks/OverridePureVirtuals.cpp | 5 ++- .../tweaks/OverridePureVirtualsTests.cpp | 39 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. 
diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang From 425fe3362d49ff1b2958f8ec9cae49e587b57e7e Mon Sep 17 00:00:00 2001 From: Alex Langford <alangford@apple.com> Date: Mon, 3 Nov 2025 10:20:08 -0800 Subject: [PATCH 050/313] [lldb] Fix unaligned writes in ObjectFileELF (#165759) The code to apply relocations was sometimes creating unaligned destination pointers. Instead of giving them an explicit type (i.e. `uint64_t *`) and forcing the compiler to generate unaligned stores, mark the pointer as `void *`. The compiler will figure out the correct series of store instructions. --- lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 49841e7307443..e06e69fb08305 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -2735,9 +2735,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2761,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. 
   WritableDataBuffer *data_buffer =
       llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
-  uint32_t *dst = reinterpret_cast<uint32_t *>(
-      data_buffer->GetBytes() + rel_section->GetFileOffset() +
-      ELFRelocation::RelocOffset32(rel));
+  void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() +
+                    ELFRelocation::RelocOffset32(rel);
   memcpy(dst, &truncated_addr, sizeof(uint32_t));
   }
 }

From ecdd660356323d18b23cbb7d8b9244a2e0662be4 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 3 Nov 2025 19:26:46 +0100
Subject: [PATCH 051/313] [clang] Report Diagnostic when builtin vector has
 negative size (#166055)

Report a diagnostic in case vector_size or ext_vector_type attributes
are used with a negative size.

The same evaluation result can be used for other checks, for example, a
size that is too big.

Issue #165463
---
 clang/docs/ReleaseNotes.rst                      |  3 +++
 clang/include/clang/Basic/DiagnosticSemaKinds.td |  2 ++
 clang/lib/Sema/SemaType.cpp                      | 10 ++++++++++
 clang/test/SemaCXX/vector.cpp                    | 13 +++++++++++++
 4 files changed, 28 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3740a4a027bd..cd272396252d0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -395,6 +395,9 @@ Improvements to Clang's diagnostics
   that were previously incorrectly accepted in case of other irrelevant
   conditions are now consistently diagnosed, identical to C++ mode.
 
+- Clang now emits a diagnostic in case `vector_size` or `ext_vector_type`
+  attributes are used with a negative size (#GH165463).
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4e369be0bbb92..fa509536bf021 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error<
   "init methods must return an object pointer type, not %0">;
 def err_attribute_invalid_size : Error<
   "vector size not an integral multiple of component size">;
+def err_attribute_vec_negative_size
+    : Error<"vector must have non-negative size">;
 def err_attribute_zero_size : Error<"zero %0 size">;
 def err_attribute_size_too_large : Error<"%0 size too large">;
 def err_typecheck_sve_rvv_ambiguous : Error<
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 280b3c92cce14..682fd258eccf2 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
     return QualType();
   }
 
+  if (VecSize->isNegative()) {
+    Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
+    return QualType();
+  }
+
   if (CurType->isDependentType())
     return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
                                           VectorKind::Generic);
@@ -2427,6 +2432,11 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
     return QualType();
   }
 
+  if (vecSize->isNegative()) {
+    Diag(ArraySize->getExprLoc(), diag::err_attribute_vec_negative_size);
+    return QualType();
+  }
+
   if (!vecSize->isIntN(32)) {
     Diag(AttrLoc, diag::err_attribute_size_too_large)
         << ArraySize->getSourceRange() << "vector";
diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp
index 808bdb679b09c..06195f039cd92 100644
--- a/clang/test/SemaCXX/vector.cpp
+++ b/clang/test/SemaCXX/vector.cpp
@@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand}}
 double f = a - e; // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}}
 int h = c - e;    // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}}
 }
+
+typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}}
+typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}}
+typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}}
+typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}}
+
+
+#if __cplusplus >= 201103L
+
+template <int N> using templated_v_size = int __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}}
+templated_v_size<-8> templated_v_neg_size; // expected-note{{in instantiation of template type alias 'templated_v_size' requested here}}
+
+#endif

From fa2c5fe21df42accfcd4498aa7e6b2ae943357ea Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Mon, 3 Nov 2025 10:50:51 -0800
Subject: [PATCH 052/313] [Github] Pin Remaining Github Actions to SHAs
 (#166194)

We had a couple in the llvm/actions repository that were pinned to
main. Pin them to the latest SHA in main to keep them consistent with
everything else. This also ensures we are compliant with our own CI
best practices and also cleans up the remaining CodeQL findings for
this specific issue.
---
 .github/workflows/hlsl-test-all.yaml     | 2 +-
 .github/workflows/libclang-abi-tests.yml | 2 +-
 .github/workflows/llvm-abi-tests.yml     | 2 +-
 .github/workflows/new-issues.yml         | 2 +-
 .github/workflows/premerge.yaml          | 2 +-
 .github/workflows/release-binaries.yml   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
index cdc951658b4d2..ce6ccfa23df6a 100644
--- a/.github/workflows/hlsl-test-all.yaml
+++ b/.github/workflows/hlsl-test-all.yaml
@@ -54,7 +54,7 @@ jobs:
         path: golden-images
     - name: Setup Windows
       if: runner.os == 'Windows'
-      uses: llvm/actions/setup-windows@main
+      uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main
       with:
         arch: amd64
     - name: Build DXC
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 5ccf976848197..432c45744abda 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -100,7 +100,7 @@ jobs:
       repo: ${{ github.repository }}
     steps:
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@main
+      uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main
    - name: Install abi-compliance-checker
      run: |
        sudo apt-get update
diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index f73d180bb0005..961f1cc79389d 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -88,7 +88,7 @@ jobs:
       repo: ${{ github.repository }}
     steps:
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@main
+      uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main
    - name: Install abi-compliance-checker
      run: |
        sudo apt-get update
diff --git
a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..973d3abf358ce 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -190,7 +190,7 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index d1a017ab7b553..acc47231e3569 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -177,7 +177,7 @@ jobs: - name: Setup Windows if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 From d200df0557b71fae0d77bc7fc1650d22a05af371 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 11:04:22 -0800 Subject: [PATCH 053/313] [libcxx] Remove Redundant Reset in ~basic_string (#164718) 8dae17be2991cd7f0d7fd9aa5aecd064520a14f6 refactors basic_string for more code reuse. This makes sense in most cases, but has performance overhead in the case of ~basic_string. The refactoring of ~basic_string to call __reset_internal_buffer() added a redundant (inside the destructor) reset of the object, which the optimizer is unable to optimize away in many cases. This patch prevents a ~1% regression we observed on an internal workload when applying the original refactoring. This does slightly pessimize the code readability, but I think this change is worth it given the performance impact. I'm hoping to add a benchmark(s) to the upstream libc++ benchmark suite around string construction/destruction to ensure that this case does not regress as it seems common in real world applications. I will put up a separate PR for that when I figure out a reasonable way to write it. 
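
To illustrate the shape of the issue (a simplified sketch with made-up
names, not the actual libc++ code): resetting members inside a
destructor is a dead store once the object's lifetime ends, but the
optimizer cannot always prove that and delete it.

```cpp
#include <new>

// Hypothetical stand-in for basic_string's buffer handling.
struct Buffer {
  char *Data;
  Buffer() : Data(static_cast<char *>(::operator new(16))) {}
  ~Buffer() {
    char *Old = Data;
    Data = nullptr; // redundant inside the destructor: the object is dead
    ::operator delete(Old);
  }
};

int main() { Buffer B; } // the store to Data above is a dead store
```

Passing an uninitialized dummy `__rep` instead lets the destructor skip
that store without changing observable behavior.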
---
 libcxx/include/string | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libcxx/include/string b/libcxx/include/string
index 33382c7af4b2c..ede42467b99fe 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -644,6 +644,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 # include <__utility/forward.h>
 # include <__utility/is_pointer_in_range.h>
 # include <__utility/move.h>
+# include <__utility/no_destroy.h>
 # include <__utility/scope_guard.h>
 # include <__utility/swap.h>
 # include <climits>
@@ -918,6 +919,7 @@ private:
     __rep() = default;
     _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {}
     _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {}
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {}
   };
 
   _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_);
@@ -1210,7 +1212,10 @@ public:
   }
 # endif // _LIBCPP_CXX03_LANG
 
-  inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); }
+  // TODO(boomanaiden154): Once we mark `this` in destructors as dead on return,
+  // we can use a normal call to __reset_internal_buffer and remove the extra
+  // __rep constructor.
+  inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT {
     return __self_view(typename __self_view::__assume_valid(), data(), size());

From 8fd1bf2f8c9e6e7c4bc5f6915a9d52bb3672601b Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Mon, 3 Nov 2025 11:11:47 -0800
Subject: [PATCH 054/313] [BPF] Remove unused weak symbol __bpf_trap (#166003)

Nikita Popov reported an issue ([1]) where a dangling weak symbol
__bpf_trap ends up in the final binary, and this causes libbpf to fail
like below:

  $ veristat -v ./t.o
  Processing 't.o'...
  libbpf: elf: skipping unrecognized data section(4) .eh_frame
  libbpf: elf: skipping relo section(5) .rel.eh_frame for section(4) .eh_frame
  libbpf: failed to find BTF for extern '__bpf_trap': -3
  Failed to open './t.o': -3

In LLVM, the DAG selection phase generates __bpf_trap in code. Later,
the UnreachableBlockElim pass removes __bpf_trap from the code, but the
__bpf_trap symbol survives in the symbol table. Having a dangling
__bpf_trap weak symbol is not good for old kernels, as seen in the
above veristat failure.

Although users could use the compiler flag
`-mllvm -bpf-disable-trap-unreachable` to work around the issue, this
patch fixes the issue by removing the dangling __bpf_trap.
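
For context, C code along these lines (an illustrative reconstruction
mirroring the .ll test added below, not the reporter's exact source)
produces the pattern: the unreachable switch default lowers to a
__bpf_trap call that is later eliminated, leaving only the symbol.

```c
int test(unsigned char x) {
  switch (x & 3) { /* only values 0..3 are possible */
  case 0:
    return 32;
  case 1:
    return 12;
  case 2:
    return 43;
  case 3:
    return 54;
  default:
    __builtin_unreachable(); /* lowers to a __bpf_trap call */
  }
}
```
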
[1] https://github.com/llvm/llvm-project/issues/165696 --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 24 ++++++++++++++++++++ llvm/lib/Target/BPF/BPFAsmPrinter.h | 1 + llvm/test/CodeGen/BPF/bpf_trap.ll | 32 +++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 llvm/test/CodeGen/BPF/bpf_trap.ll diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..b2a82040ee823 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,20 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } else if (Op.isSymbol()) { + if (const MCSymbol *Sym = Op.getMCSymbol()) + if (Sym->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap From 0a28c07f2f51b6c94546f8e8e59c4e934ba6c520 Mon Sep 17 00:00:00 2001 From: Tom Stellard <tstellar@redhat.com> Date: Mon, 3 Nov 2025 11:12:23 -0800 Subject: [PATCH 055/313] workflows/release-binaries: Disable LTO/PGO for testing macOS job in PRs (#165801) When a PR is submitted the macos-14 workflow will run with LTO/PGO disabled. This makes it possible to run the workflow on the free runners with the six hour timeout and will allow us to test the workflow on pull requests. 
---
 .github/workflows/release-binaries.yml | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index acc47231e3569..25f426b7814df 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -138,7 +138,6 @@ jobs:
             target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
           fi

-          echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
           case "${{ inputs.runs-on }}" in
             ubuntu-22.04*)
               build_runs_on="depot-${{ inputs.runs-on }}-16"
@@ -157,6 +156,23 @@ jobs:
               build_runs_on=$test_runs_on
               ;;
           esac
+
+          case "$build_runs_on" in
+            # These runners cannot build the full release package within
+            # the 6-hour timeout limit, so we need to use a configuration
+            # that builds more quickly.
+            macos-14)
+              bootstrap_prefix="BOOTSTRAP"
+              target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF"
+              ;;
+            *)
+              bootstrap_prefix="BOOTSTRAP_BOOTSTRAP"
+              ;;
+          esac
+
+          target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename"
+
+          echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
           echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT
           echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT

@@ -200,8 +216,7 @@ jobs:
           # so we need to set some extra cmake flags to disable this.
           cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \
               ${{ needs.prepare.outputs.target-cmake-flags }} \
-              -C clang/cmake/caches/Release.cmake \
-              -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
+              -C clang/cmake/caches/Release.cmake

       - name: Build
         shell: bash

From 3a0c534032aac828a12fd2057b00197a12aa90b2 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Mon, 3 Nov 2025 11:13:50 -0800
Subject: [PATCH 056/313] [ORC] Fix -Wunused-function warning on windows (#166207)

All of the users of this function are guarded by LLVM_ON_UNIX and
LLVM_ENABLE_THREADS ifdefs, so wrap the function itself in these guards
as well to avoid the unused-function warning.
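The fix follows a pattern that generalizes beyond ORC: a `static` function
whose only call sites are conditionally compiled should sit behind the same
preprocessor condition, otherwise configurations that drop the call sites
emit -Wunused-function. A minimal, self-contained sketch of the idea (the
feature macro and function names are hypothetical, not the llvm-jitlink
code itself):

```cpp
#include <cstdio>

// Stand-in for a configuration condition such as
// LLVM_ON_UNIX && LLVM_ENABLE_THREADS.
#define HAVE_FEATURE_X 1

#if HAVE_FEATURE_X
// Defined only when a caller exists; an unguarded definition would trigger
// -Wunused-function on configurations that compile the callers out.
static void setup_feature_x() { std::puts("feature X configured"); }
#endif

int main() {
#if HAVE_FEATURE_X
  setup_feature_x(); // the only call site, guarded identically
#endif
  return 0;
}
```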
--- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 79216e89c7cba..88d6daf08d35e 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -776,6 +776,7 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) { SlabSize, SREPC, SAs); } +#if LLVM_ON_UNIX && LLVM_ENABLE_THREADS static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { switch (UseMemMgr) { case MemMgr::Default: @@ -789,6 +790,7 @@ static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { break; } } +#endif static Expected<MaterializationUnit::Interface> getTestObjectFileInterface(Session &S, MemoryBufferRef O) { From 0c707c9713f0657f7208b8f9a95a13af749d95c5 Mon Sep 17 00:00:00 2001 From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com> Date: Mon, 3 Nov 2025 20:15:14 +0100 Subject: [PATCH 057/313] [libc] Add printf error handling (#162876) [#159474](https://github.com/llvm/llvm-project/issues/159474) - All printf variants set errno and consistently return -1 on error, instead of returning various predefined error codes - Return value overflow handling is added --- libc/src/stdio/CMakeLists.txt | 24 +++++++++ libc/src/stdio/asprintf.cpp | 18 ++++++- libc/src/stdio/baremetal/CMakeLists.txt | 8 +++ libc/src/stdio/baremetal/printf.cpp | 23 ++++++-- libc/src/stdio/baremetal/vprintf.cpp | 23 ++++++-- libc/src/stdio/generic/CMakeLists.txt | 4 ++ libc/src/stdio/generic/fprintf.cpp | 17 +++++- libc/src/stdio/generic/printf.cpp | 17 +++++- libc/src/stdio/generic/vfprintf.cpp | 17 +++++- libc/src/stdio/generic/vprintf.cpp | 17 +++++- libc/src/stdio/printf_core/CMakeLists.txt | 25 +++++++++ libc/src/stdio/printf_core/core_structs.h | 19 ++++--- libc/src/stdio/printf_core/error_mapper.h | 21 ++++++++ .../stdio/printf_core/generic/CMakeLists.txt | 8 +++ .../stdio/printf_core/generic/error_mapper.h | 49 +++++++++++++++++ .../stdio/printf_core/linux/CMakeLists.txt | 8 +++ .../stdio/printf_core/linux/error_mapper.h | 54 +++++++++++++++++++ libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 +++---- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++++++----- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 ++++++- libc/src/stdio/sprintf.cpp | 18 ++++++- libc/src/stdio/vasprintf.cpp | 16 +++++- libc/src/stdio/vsnprintf.cpp | 19 ++++++- libc/src/stdio/vsprintf.cpp | 17 +++++- libc/src/stdlib/CMakeLists.txt | 6 +++ libc/src/stdlib/strfromd.cpp | 10 +++- libc/src/stdlib/strfromf.cpp | 10 +++- libc/src/stdlib/strfroml.cpp | 10 +++- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 + libc/test/src/stdio/fprintf_test.cpp | 23 ++++++++ .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ++++++ libc/test/src/stdio/vfprintf_test.cpp | 5 ++ libc/test/src/stdlib/StrfromTest.h | 19 ++++++- 39 files changed, 584 insertions(+), 104 deletions(-) create mode 100644 libc/src/stdio/printf_core/error_mapper.h create mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h create mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h 
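Before reading the per-file diffs, it helps to see the shape every public
entrypoint takes after this patch: the core engine reports either a character
count or an internal error code, and the entrypoint maps that code to an errno
value plus the conventional -1 return. The sketch below is a simplified
stand-in with invented names, not the actual LLVM libc sources, and it assumes
EOVERFLOW is available from <cerrno>:

```cpp
#include <cerrno>
#include <climits>
#include <cstddef>

// Engine error codes live below -1000 so they cannot collide with positive
// system errno values (mirrors the scheme in the patched core_structs.h).
constexpr int ALLOCATION_ERROR = -1006;
constexpr int OVERFLOW_ERROR = -1007;

// value is the number of characters written, valid only when err == 0.
struct ResultOrError {
  size_t value;
  int err; // 0 on success, otherwise a negative internal code
};

static int internal_error_to_errno(int internal_error) {
  switch (internal_error) {
  case ALLOCATION_ERROR:
    return ENOMEM;
  case OVERFLOW_ERROR:
    return EOVERFLOW;
  default:
    return EINVAL;
  }
}

// The tail that each printf variant now shares: map engine errors to errno
// and -1, and reject counts that do not fit in the int return type.
int entrypoint_tail(ResultOrError r) {
  if (r.err != 0) {
    errno = internal_error_to_errno(r.err);
    return -1;
  }
  if (r.value > static_cast<size_t>(INT_MAX)) {
    errno = internal_error_to_errno(OVERFLOW_ERROR);
    return -1;
  }
  return static_cast<int>(r.value);
}
```

This same tail appears, with the real ErrorOr type, in printf, fprintf,
snprintf, asprintf, and the strfrom* functions below, which is why those
diffs look repetitive.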
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..083f40c1f19fa 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..0c6c9ad338c9f 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..d2f586c70ad1c 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include 
"src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b07f2528fe11d 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..d6d4adcefb3b1 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..c00352d1dd666 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
-  int ret_val = printf_core::vfprintf_internal(stream, format, args);
-  return ret_val;
+  auto ret_val = printf_core::vfprintf_internal(stream, format, args);
+  if (!ret_val.has_value()) {
+    libc_errno = printf_core::internal_error_to_errno(ret_val.error());
+    return -1;
+  }
+  if (ret_val.value() > cpp::numeric_limits<int>::max()) {
+    libc_errno =
+        printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
+    return -1;
+  }
+
+  return static_cast<int>(ret_val.value());
 }

 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp
index 08d71515646ed..1c0837fd5d441 100644
--- a/libc/src/stdio/generic/vprintf.cpp
+++ b/libc/src/stdio/generic/vprintf.cpp
@@ -8,9 +8,12 @@

 #include "src/stdio/vprintf.h"

+#include "src/__support/CPP/limits.h"
 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/macros/config.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
 #include "src/stdio/printf_core/vfprintf_internal.h"

 #include "hdr/types/FILE.h"
@@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf,
   internal::ArgList args(vlist); // This holder class allows for easier copying
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
-  int ret_val = printf_core::vfprintf_internal(
+  auto ret_val = printf_core::vfprintf_internal(
       reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args);
-  return ret_val;
+  if (!ret_val.has_value()) {
+    libc_errno = printf_core::internal_error_to_errno(ret_val.error());
+    return -1;
+  }
+  if (ret_val.value() > cpp::numeric_limits<int>::max()) {
+    libc_errno =
+        printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
+    return -1;
+  }
+
+  return static_cast<int>(ret_val.value());
 }

 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index ee66145e60156..2d1daea71406e 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -32,6 +32,17 @@ if(printf_config_copts)
   list(PREPEND printf_config_copts "COMPILE_OPTIONS")
 endif()

+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+else()
+  add_subdirectory(generic)
+endif()
+
+set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper)
+if(NOT TARGET ${target_error_mapper})
+  set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper)
+endif()
+
 add_header_library(
   printf_config
   HDRS
@@ -47,6 +58,7 @@ add_header_library(
     libc.include.inttypes
     libc.src.__support.CPP.string_view
     libc.src.__support.FPUtil.fp_bits
+    libc.hdr.errno_macros
 )

 add_header_library(
@@ -125,6 +137,7 @@ add_header_library(
     .writer
     .core_structs
     libc.src.__support.arg_list
+    libc.src.__support.error_or
 )

 add_header_library(
@@ -136,10 +149,20 @@ add_header_library(
     libc.hdr.func.free
     libc.hdr.func.realloc
     libc.src.__support.arg_list
+    libc.src.__support.error_or
     libc.src.stdio.printf_core.printf_main
     libc.src.stdio.printf_core.writer
 )

+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    ${target_error_mapper}
+    libc.src.__support.macros.properties.architectures
+)
+
 if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD)
   # Not all platforms have a file implementation. If file is unvailable, and a
   # full build is requested, then we must skip all file based printf sections.
@@ -152,8 +175,10 @@ add_header_library(
     vfprintf_internal.h
   DEPENDS
     libc.src.__support.File.file
+    libc.src.__support.error_or
     libc.src.__support.arg_list
     libc.src.stdio.printf_core.printf_main
     libc.src.stdio.printf_core.writer
     ${use_system_file}
 )
+

diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h
index e27f77b6b594a..0d41f2244d8da 100644
--- a/libc/src/stdio/printf_core/core_structs.h
+++ b/libc/src/stdio/printf_core/core_structs.h
@@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() {
 // This is the value to be returned by conversions when no error has occurred.
 constexpr int WRITE_OK = 0;

-// These are the printf return values for when an error has occurred. They are
-// all negative, and should be distinct.
-constexpr int FILE_WRITE_ERROR = -1;
-constexpr int FILE_STATUS_ERROR = -2;
-constexpr int NULLPTR_WRITE_ERROR = -3;
-constexpr int INT_CONVERSION_ERROR = -4;
-constexpr int FIXED_POINT_CONVERSION_ERROR = -5;
-constexpr int ALLOCATION_ERROR = -6;
+// These are the error return values used by the printf engine when an
+// error has occurred. They are all large negative, distinct values starting
+// below -1000 so that they do not overlap with system errors.
+constexpr int FILE_WRITE_ERROR = -1001;
+constexpr int FILE_STATUS_ERROR = -1002;
+constexpr int NULLPTR_WRITE_ERROR = -1003;
+constexpr int INT_CONVERSION_ERROR = -1004;
+constexpr int FIXED_POINT_CONVERSION_ERROR = -1005;
+constexpr int ALLOCATION_ERROR = -1006;
+constexpr int OVERFLOW_ERROR = -1007;
+
 } // namespace printf_core
 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h
new file mode 100644
index 0000000000000..23030930133a1
--- /dev/null
+++ b/libc/src/stdio/printf_core/error_mapper.h
@@ -0,0 +1,21 @@
+//===-- Error mapper for printf ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+// Maps internal errors to the available errnos on the platform.
+#if defined(__linux__)
+#include "linux/error_mapper.h"
+#else
+#include "generic/error_mapper.h"
+#endif
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)

diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h
new file mode 100644
index 0000000000000..d8cdd2cc2dbaa
--- /dev/null
+++ b/libc/src/stdio/printf_core/generic/error_mapper.h
@@ -0,0 +1,49 @@
+//===-- Generic implementation of error mapper ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to the available C standard errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+  case FILE_STATUS_ERROR:
+  case NULLPTR_WRITE_ERROR:
+  case ALLOCATION_ERROR:
+    return EDOM;
+  case INT_CONVERSION_ERROR:
+  case FIXED_POINT_CONVERSION_ERROR:
+  case OVERFLOW_ERROR:
+    return ERANGE;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EDOM;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)

diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h
new file mode 100644
index 0000000000000..3c2fe663072d0
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/error_mapper.h
@@ -0,0 +1,54 @@
+//===-- Linux implementation of error mapper --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to POSIX errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+    return EIO;
+  case FILE_STATUS_ERROR:
+    return EIO;
+  case NULLPTR_WRITE_ERROR:
+    return EINVAL;
+  case INT_CONVERSION_ERROR:
+    return ERANGE;
+  case FIXED_POINT_CONVERSION_ERROR:
+    return EINVAL;
+  case ALLOCATION_ERROR:
+    return ENOMEM;
+  case OVERFLOW_ERROR:
+    return EOVERFLOW;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EINVAL;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h
index 57f29858d5298..1c7a7237c097d 100644
--- a/libc/src/stdio/printf_core/printf_main.h
+++ b/libc/src/stdio/printf_core/printf_main.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H

 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {

 template <WriteMode write_mode>
-int printf_main(Writer<write_mode> *writer, const char *__restrict str,
-                internal::ArgList &args) {
+ErrorOr<size_t> printf_main(Writer<write_mode> *writer,
+                            const char *__restrict str,
+                            internal::ArgList &args) {
   Parser<internal::ArgList> parser(str, args);
   int result = 0;
   for (FormatSection cur_section = parser.get_next_section();
@@ -33,9 +35,9 @@ int printf_main(Writer<write_mode> *writer, const char *__restrict str,
     if (cur_section.has_conv)
       result = convert(writer, cur_section);
     else
      result = writer->write(cur_section.raw_string);
-    if (result < 0)
-      return result;
+    if (result < 0)
+      return Error(-result);
   }

   return writer->get_chars_written();

diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h
index 283d8df2810fb..41df17b67f35b 100644
--- a/libc/src/stdio/printf_core/vasprintf_internal.h
+++ b/libc/src/stdio/printf_core/vasprintf_internal.h
@@ -10,6 +10,7 @@
 #include "hdr/func/malloc.h"
 #include "hdr/func/realloc.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/printf_main.h"
 #include "src/stdio/printf_core/writer.h"
@@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {
     if (new_buff == nullptr) {
       if (wb->buff != wb->init_buff)
         free(wb->buff);
-      return printf_core::ALLOCATION_ERROR;
+      return ALLOCATION_ERROR;
     }
     if (isBuffOnStack)
       inline_memcpy(new_buff, wb->buff, wb->buff_cur);
@@ -42,27 +43,28 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {

 constexpr size_t DEFAULT_BUFFER_SIZE = 200;

-LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format,
-                                   internal::ArgList args) {
+LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret,
+                                               const char *__restrict format,
+                                               internal::ArgList args) {
   char init_buff_on_stack[DEFAULT_BUFFER_SIZE];
   printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb(
       init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook);
   printf_core::Writer writer(wb);

   auto ret_val = printf_core::printf_main(&writer, format, args);
-  if (ret_val < 0) {
+  if (!ret_val.has_value()) {
     *ret = nullptr;
-    return -1;
+    return ret_val;
   }
   if (wb.buff == init_buff_on_stack) {
-    *ret = static_cast<char *>(malloc(ret_val + 1));
+    *ret = static_cast<char *>(malloc(ret_val.value() + 1));
     if (ret == nullptr)
-      return printf_core::ALLOCATION_ERROR;
+      return Error(ALLOCATION_ERROR);
-    inline_memcpy(*ret, wb.buff, ret_val);
+    inline_memcpy(*ret, wb.buff, ret_val.value());
   } else {
     *ret = wb.buff;
   }
-  (*ret)[ret_val] = '\0';
+  (*ret)[ret_val.value()] = '\0';
   return ret_val;
 }
 } // namespace printf_core

diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 630de9d9d43dd..564441d3bf51a 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -11,6 +11,7 @@

 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/attributes.h" // For LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) {
   reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock();
 }

-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   FILE *f) {
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, FILE *f) {
   return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked(
       ptr, size * nmemb);
 }
@@ -47,9 +48,11 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); }

 LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }

-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   ::FILE *f) {
-  return ::fwrite_unlocked(ptr, size, nmemb, f);
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, ::FILE *f) {
+  // Need to use the system errno in this case, as the system write will set
+  // it, and we need to propagate it back into our code.
+  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
 }
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
 } // namespace internal
@@ -60,26 +63,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) {
   ::FILE *target_file = reinterpret_cast<::FILE *>(fp);
   // Write new_str to the target file. The logic preventing a zero-length write
   // is in the writer, so we don't check here.
-  size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char),
-                                             new_str.size(), target_file);
-  if (written != new_str.size() || internal::ferror_unlocked(target_file))
+  auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char),
+                                                new_str.size(), target_file);
+  // Propagate the actual system error in FileIOResult.
+  if (write_result.has_error())
+    return -write_result.error;
+
+  // In case a short write occurred or the error was not set on FileIOResult
+  // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..206c50d1b41a4 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..9e9ecdfdf38cc 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..4bc6a5992d5c8 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" +#include "src/stdio/printf_core/core_structs.h" 
+#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..ba3568fbe78b3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..65c223a442d4b 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..f970e22010201 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,12 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..55ede003134b5 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include 
"src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,12 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..37d18738902bc 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,12 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..2b136d83234cd 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,7 +36,8 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - return writer->get_chars_written(); + // TODO: Use ErrorOr<size_t> + return static_cast<int>(writer->get_chars_written()); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..d71f1dff11943 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,8 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..7d36bd30854b8 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,9 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +34,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -78,6 +83,24 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS 
+TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %s", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { 
wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, 
WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..1062f952d7429 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. 
@@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EOVERFLOW); +} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..9b5f09db8fd41 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..fdeed0e3c06f5 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,8 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +17,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +483,16 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + void charsWrittenOverflow(FunctionT func) { + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. 
+ int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); + + EXPECT_LT(result, 0); + ASSERT_ERRNO_EQ(EOVERFLOW); + } }; #define STRFROM_TEST(InputType, name, func) \ @@ -501,4 +513,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ + TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ + charsWrittenOverflow(func); \ + } From ca69a8d2f403de3617970dcfa2f84756f7f336dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= <bjoern@hazardy.de> Date: Mon, 3 Nov 2025 20:19:34 +0100 Subject: [PATCH 058/313] [clang-format] Fix ColumnLimit violation while aligning (#165627) It did compute the length only on the first line, and thus the following lines could be (and in the test example were) moved over the column limit, when the = was aligned. --- clang/lib/Format/WhitespaceManager.cpp | 3 ++- clang/unittests/Format/FormatTest.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index f24b8ab14bdce..406c77cb3ae8f 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, CurrentChangeWidthRight = CurrentChange.TokenLength; const FormatToken *MatchingParenToEncounter = nullptr; for (unsigned J = I + 1; - J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter); + J != E && (Changes[J].NewlinesBefore == 0 || + MatchingParenToEncounter || Changes[J].IsAligned); ++J) { const auto &Change = Changes[J]; const auto *Tok = Change.Tok; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index ca9e7925e5e95..24235b966399d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -20824,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) { " argument1,\n" " argument2);", Style); + + Style.ColumnLimit = 45; + verifyFormat("auto xxxxxxxx = foo;\n" + "auto x = whatever ? some / long -\n" + " computition / stuff\n" + " : random;", + Style); } TEST_F(FormatTest, AlignWithInitializerPeriods) { From dd45c060ffe05eca991b8ed01a71d09edeebbc94 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 20:54:02 +0100 Subject: [PATCH 059/313] [clang][NFC] Fix BuildExtVectorType parameter name (#166208) Fix the parameter name in the BuildExtVectorType function, also updating the code style to be consistent with BuildVectorType Discovered in #166055 --- clang/lib/Sema/SemaType.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 682fd258eccf2..c483930705057 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2399,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, VectorKind::Generic); } -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr, SourceLocation AttrLoc) { // Unlike gcc's vector_size attribute, we do not allow vectors to be defined // in conjunction with complex types (pointers, arrays, functions, etc.). 
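// Aside (not part of the patch, added for context): the expression being renamed in the next hunk is the size argument of Clang's ext_vector_type attribute, e.g.
//   typedef float float4 __attribute__((ext_vector_type(4)));
// The checks below reject a size that is not an integer constant (unless dependent), is negative, is zero, or needs more than 32 bits.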
@@ -2422,40 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, BIT && CheckBitIntElementType(*this, AttrLoc, BIT)) return QualType(); - if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) { - std::optional<llvm::APSInt> vecSize = - ArraySize->getIntegerConstantExpr(Context); - if (!vecSize) { + if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) { + std::optional<llvm::APSInt> VecSize = + SizeExpr->getIntegerConstantExpr(Context); + if (!VecSize) { Diag(AttrLoc, diag::err_attribute_argument_type) - << "ext_vector_type" << AANT_ArgumentIntegerConstant - << ArraySize->getSourceRange(); + << "ext_vector_type" << AANT_ArgumentIntegerConstant + << SizeExpr->getSourceRange(); return QualType(); } - if (vecSize->isNegative()) { - Diag(ArraySize->getExprLoc(), diag::err_attribute_vec_negative_size); + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); return QualType(); } - if (!vecSize->isIntN(32)) { + if (!VecSize->isIntN(32)) { Diag(AttrLoc, diag::err_attribute_size_too_large) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } // Unlike gcc's vector_size attribute, the size is specified as the // number of elements, not the number of bytes. - unsigned vectorSize = static_cast<unsigned>(vecSize->getZExtValue()); + unsigned VectorSize = static_cast<unsigned>(VecSize->getZExtValue()); - if (vectorSize == 0) { + if (VectorSize == 0) { Diag(AttrLoc, diag::err_attribute_zero_size) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } - return Context.getExtVectorType(T, vectorSize); + return Context.getExtVectorType(T, VectorSize); } - return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc); + return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc); } QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, From a8ea7f4580b467183ce2075db6b1b2ec3beb6ebf Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <robert.imschweiler@amd.com> Date: Mon, 3 Nov 2025 20:59:48 +0100 Subject: [PATCH 060/313] Reapply: [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm (#152161) (#166195) Reapply #152161 with fixed 'changed' flags. 
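For readers unfamiliar with callbr: it is the IR terminator Clang emits for GNU `asm goto`, so source like the sketch below is the kind of input that ultimately exercises these passes. The function, label name, and asm string here are illustrative assumptions only, not taken from this patch or its tests:

  /* Hypothetical example: 'asm goto' lowers to a callbr terminator whose
     fallthrough successor is the next statement and whose indirect
     successor is the 'err' block; %l1 refers to the first goto label. */
  int check_nonzero(int x) {
    asm goto("s_cmp_eq_u32 %0, 0; s_cbranch_scc1 %l1"
             : /* no outputs */ : "s"(x) : /* no clobbers */ : err);
    return 0; /* fallthrough successor */
  err:
    return 1; /* indirect successor */
  }

Both AMDGPUUnifyDivergentExitNodes and StructurizeCFG previously asserted that every block ends in a simple terminator; the changes below teach them to treat a callbr as an N-target branch instead.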
--- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 90 +++--- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 19 +- llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 6 + llvm/test/CodeGen/AMDGPU/callbr.ll | 54 ++++ ...nify-divergent-exit-nodes-with-musttail.ll | 51 ++++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 257 ++++++++++++++++-- .../si-annotate-nested-control-flows.ll | 100 ++++++- .../si-unify-exit-multiple-unreachables.ll | 161 ++++++++++- llvm/test/CodeGen/AMDGPU/update-phi.ll | 39 +++ llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ++++++++++++++++ 10 files changed, 933 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll create mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index ddf9a24eb5230..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = - BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5f6f66a4bc213..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended use cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. 
+ Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to 
label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; 
%UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define 
amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: 
@infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void 
@nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", 
"r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) 
[[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. 
For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... +; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. 
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} From 15bbdd143cb196be8d60ea4dc813bf7bc5e4650c Mon Sep 17 00:00:00 2001 From: Charles Zablit <c_zablit@apple.com> Date: Mon, 3 Nov 2025 20:00:16 +0000 Subject: [PATCH 061/313] [lldb][windows] print an error if python.dll is not in the DLL search path (#164893) This is a follow up to https://github.com/llvm/llvm-project/pull/162509. Using the `SearchPathW` API, we can ensure that the correct version of Python is installed before `liblldb` is loaded (and `python.dll` subsequently). If it's not, we try to add it to the search path with the methods introduced in https://github.com/llvm/llvm-project/pull/162509. If that fails or if that method is `#ifdef`'d out, we print an error which will appear before lldb crashes due to the missing dll. Before https://github.com/llvm/llvm-project/pull/162509, when invoked from Powershell, lldb would silently crash (no error message/crash report). After https://github.com/llvm/llvm-project/pull/162509, it crashes without any indications that the root cause is the missing python.dll. With this patch, we print the error before crashing. --- lldb/CMakeLists.txt | 6 +++ lldb/tools/driver/CMakeLists.txt | 3 ++ lldb/tools/driver/Driver.cpp | 68 ++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..01b5546fee00d 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -87,6 +87,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt index 67956af7fe3fb..efe51506f3545 100644 --- a/lldb/tools/driver/CMakeLists.txt +++ b/lldb/tools/driver/CMakeLists.txt @@ -37,6 +37,9 @@ add_dependencies(lldb if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH) target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}") endif() +if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME) + target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}") +endif() if(LLDB_BUILD_FRAMEWORK) # In the build-tree, we know the exact path to the framework directory. 
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index 733331f4ddac0..bebf1a70d50e9 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) {
   return error;
 }
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+#ifdef _WIN32
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
 /// Returns the full path to the lldb.exe executable.
 inline std::wstring GetPathToExecutableW() {
   // Iterate until we reach the Windows API maximum path length (32,767).
@@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() {
   return L"";
 }
 
-/// Resolve the full path of the directory defined by
+/// \brief Resolve the full path of the directory defined by
 /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL
 /// search directories.
-void AddPythonDLLToSearchPath() {
+/// \return `true` if the library was added to the search path.
+/// `false` otherwise.
+bool AddPythonDLLToSearchPath() {
   std::wstring modulePath = GetPathToExecutableW();
-  if (modulePath.empty()) {
-    llvm::errs() << "error: unable to find python.dll." << '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path, whether or not it is already there.
+void SetupPythonRuntimeLibrary() {
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+  if (IsPythonDLLInPath())
+    return;
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
+  if (AddPythonDLLToSearchPath() && IsPythonDLLInPath())
+    return;
+#endif
+  llvm::errs() << "error: unable to find '"
+               << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n";
+  return;
+#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+  if (!AddPythonDLLToSearchPath())
+    llvm::errs() << "error: unable to find the Python runtime library.\n";
+#endif
 }
 #endif
 
@@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) {
                 "~/Library/Logs/DiagnosticReports/.\n");
 #endif
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
-  AddPythonDLLToSearchPath();
+#ifdef _WIN32
+  SetupPythonRuntimeLibrary();
 #endif
 
   // Parse arguments.
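
The probe-then-extend flow in the patch above reduces to two Win32 calls: `SearchPathW` asks whether a DLL is resolvable through the current DLL search path, and `SetDllDirectoryW` extends that path. Below is a minimal standalone sketch of the same pattern, not taken from the patch; the DLL name `python311.dll` and the directory `C:\Python311` are placeholder assumptions for illustration only.

```
// Hedged sketch of the probe-then-extend pattern described above.
// "python311.dll" and "C:\\Python311" are illustrative placeholders,
// not values used by lldb.
#include <windows.h>
#include <cstdio>

// Returns true if `dllName` can be resolved via the current DLL search path.
static bool IsDllResolvable(const wchar_t *dllName) {
  WCHAR foundPath[MAX_PATH];
  // SearchPathW returns the length of the resolved path, or 0 on failure.
  return SearchPathW(nullptr, dllName, nullptr, MAX_PATH, foundPath,
                     nullptr) > 0;
}

int main() {
  const wchar_t *dll = L"python311.dll"; // placeholder
  const wchar_t *dir = L"C:\\Python311"; // placeholder
  if (!IsDllResolvable(dll)) {
    // Extend the search path and probe a second time, mirroring the
    // fallback in SetupPythonRuntimeLibrary; report before any crash.
    if (!SetDllDirectoryW(dir) || !IsDllResolvable(dll))
      fwprintf(stderr, L"error: unable to find '%ls'.\n", dll);
  }
  return 0;
}
```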
From d4ca474ca3105743311130134562b40fddce7a56 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:03:43 -0800 Subject: [PATCH 062/313] [libc++][docs] Fix documentation of `REQUIRES: std-at-least-*` (#166221) --- libcxx/docs/TestingLibcxx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..062c09571eafe 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: requires-std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a From 42b608cdd6cd96e4fab7e4311731d6b0956376ad Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Mon, 3 Nov 2025 12:08:56 -0800 Subject: [PATCH 063/313] [lldb] Skip tests on older versions of clang --- .../API/commands/expression/weak_symbols/TestWeakSymbols.py | 2 +- .../TestLibcxxInternalsRecognizer.py | 2 +- .../API/lang/objc/modules-auto-import/TestModulesAutoImport.py | 1 + .../lang/objc/modules-objc-property/TestModulesObjCProperty.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..d8a729b322fe4 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py 
index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() From 009706ff6295882a17fb2af1a1eebdfe7c476114 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 20:09:18 +0000 Subject: [PATCH 064/313] [Github] Use truncated body in llvm-bugs.yml \#166081 forgot to actually use this as the body. --- .github/workflows/llvm-bugs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index cd3f396e7c465..3274f1adf9e61 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -52,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { From dccced25a01478c339e37fd7ef30c0958cb43742 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:13:27 -0800 Subject: [PATCH 065/313] [libc++][docs] Fix documentation of `REQUIRES: std-at-least-*` (#166226) Due to me not double-checking my PR, an overly eager AI auto-completion made it into my previous PR :/ --- libcxx/docs/TestingLibcxx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 062c09571eafe..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // REQUIRES: requires-std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a From f02b661054547b423177c9498cdb554f5036a3e0 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:18:43 -0800 Subject: [PATCH 066/313] [libc++] Add move constructor & assignment to `exception_ptr` (#164281) This commit adds move constructor, move assignment and `swap` to `exception_ptr`. Adding those operators allows us to avoid unnecessary calls to `__cxa_{inc,dec}rement_refcount`. Performance results (from libc++'s CI): ``` Benchmark Baseline Candidate Difference % Difference ------------------------------------ ---------- ----------- ------------ -------------- bm_exception_ptr_copy_assign_nonnull 9.77 9.94 0.18 1.79% bm_exception_ptr_copy_assign_null 10.29 10.65 0.35 3.42% bm_exception_ptr_copy_ctor_nonnull 7.02 7.01 -0.01 -0.13% bm_exception_ptr_copy_ctor_null 10.54 10.60 0.06 0.56% bm_exception_ptr_move_assign_nonnull 16.92 13.76 -3.16 -18.70% bm_exception_ptr_move_assign_null 10.61 10.76 0.14 1.36% bm_exception_ptr_move_ctor_nonnull 13.31 10.25 -3.06 -23.02% bm_exception_ptr_move_ctor_null 10.28 7.30 -2.98 -28.95% bm_exception_ptr_swap_nonnull 19.22 0.63 -18.59 -96.74% bm_exception_ptr_swap_null 20.02 7.79 -12.23 -61.07% ``` As expected, the `bm_exception_ptr_copy_*` benchmarks are not influenced by this change. `bm_exception_ptr_move_*` benefits between 18% and 30%. 
The `bm_exception_ptr_swap_*` tests show the biggest improvements since multiple calls to the copy constructor are replaced by a simple pointer swap. While `bm_exception_ptr_move_assign_null` did not show a regression in the CI measurements, local measurements showed a regression from 3.98 to 4.71, i.e. by 18%. This is due to the additional `__tmp` inside `operator=`. The destructor of `__other` is a no-op after the move because `__other.__ptr` will be a nullptr. However, the compiler does not realize this, since the destructor is not inlined and is lacking a fast-path. As such, the swap-based implementation leads to an additional destructor call. `bm_exception_ptr_move_assign_nonnull` still benefits because the swap-based move constructor avoids unnecessary __cxa_{in,de}crement_refcount calls. As soon as we inline the destructor, this regression should disappear again. Works towards #44892 --- libcxx/include/__exception/exception_ptr.h | 23 ++++++++++ libcxx/modules/std/exception.inc | 1 + .../propagation/exception_ptr.pass.cpp | 1 - .../exception_ptr_move_assignment.pass.cpp | 45 +++++++++++++++++++ .../exception_ptr_move_ctr.pass.cpp | 43 ++++++++++++++++++ .../propagation/exception_ptr_swap.pass.cpp | 40 +++++++++++++++++ 6 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..e78126ea23852 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -16,6 +16,8 @@ #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> +#include <__utility/move.h> +#include <__utility/swap.h> #include <cstdlib> #include <typeinfo> @@ -23,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -57,6 +62,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -75,7 +82,15 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return *this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +103,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline 
_LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -201,4 +222,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. 
+#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT)
+  assert(p2 == nullptr);
+  assert(p2 == nullptr);
+#endif
+
+  try {
+    std::rethrow_exception(p3);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp
new file mode 100644
index 0000000000000..122e229fd6e47
--- /dev/null
+++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-exceptions, c++03
+
+// <exception>
+
+// typedef unspecified exception_ptr;
+
+// Test the move constructor of exception_ptr
+
+#include <exception>
+#include <utility>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::exception_ptr p = std::make_exception_ptr(42);
+  std::exception_ptr p2{p};
+  assert(p2 == p);
+  // Under test: The move constructor
+  std::exception_ptr p3{std::move(p2)};
+  assert(p3 == p);
+// `p2` was moved from. In libc++ it will be nullptr, but
+// this is not guaranteed by the standard.
+#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT)
+  assert(p2 == nullptr);
+#endif
+
+  try {
+    std::rethrow_exception(p3);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp
new file mode 100644
index 0000000000000..82b4713bed538
--- /dev/null
+++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-exceptions
+
+// <exception>
+
+// typedef unspecified exception_ptr;
+
+// Test swapping of exception_ptr
+
+#include <exception>
+#include <utility>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::exception_ptr p21 = std::make_exception_ptr(42);
+  std::exception_ptr p42 = std::make_exception_ptr(21);
+  std::swap(p42, p21);
+
+  try {
+    std::rethrow_exception(p21);
+  } catch (int e) {
+    assert(e == 21);
+  }
+  try {
+    std::rethrow_exception(p42);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}

From ccc473254fd2d0da01921e8402fbd4f678ff46f1 Mon Sep 17 00:00:00 2001
From: Doug Wyatt <doug@sonosphere.com>
Date: Mon, 3 Nov 2025 12:26:49 -0800
Subject: [PATCH 067/313] [Clang] FunctionEffects: properly extract the type of
 a bound member function from a CallExpr.
(#166101) There's a bug illustrated by this example: ``` template <typename T> struct Holder { T value; T& operator*() { return value; } }; struct X { using Dispatch = float (X::*)() [[clang::nonblocking]]; void fails(Holder<Dispatch>& holder) [[clang::nonblocking]] { (this->*(*holder))(); <<< the expression is incorrectly determined not to be nonblocking } void succeeds(Holder<Dispatch>& holder) [[clang::nonblocking]] { auto func = *holder; (this->*func)(); } }; ``` In both cases we have a `CXXMemberCallExpr`. In `succeeds`, the expression refers to a `Decl` (`func`) and gets a useful PTMF type. In `fails`, the expression does not refer to a `Decl` and its type is special, printed as `bound member function`. `Expr` provides a method for extracting the true type so we can use that in this situation. --------- Co-authored-by: Doug Wyatt <dwyatt@apple.com> Co-authored-by: Sirraide <aeternalmail@gmail.com> --- clang/lib/Sema/SemaFunctionEffects.cpp | 12 +++++-- .../Sema/attr-nonblocking-constraints.cpp | 33 +++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 8590ee831084f..5459861ec349d 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1208,8 +1208,16 @@ class Analyzer { return true; } - // No Decl, just an Expr. Just check based on its type. - checkIndirectCall(Call, CalleeExpr->getType()); + // No Decl, just an Expr. Just check based on its type. Bound member + // functions are a special expression type and need to be specially + // unpacked. + QualType CalleeExprQT = CalleeExpr->getType(); + if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) { + QualType QT = Expr::findBoundMemberType(CalleeExpr); + if (!QT.isNull()) + CalleeExprQT = QT; + } + checkIndirectCall(Call, CalleeExprQT); return true; } diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index b26a945843696..0d2dbb4947dc8 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -235,16 +235,35 @@ void nb13() [[clang::nonblocking]] { nb12(); } // C++ member function pointers struct PTMFTester { typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]]; - - void convert() [[clang::nonblocking]]; + typedef void (PTMFTester::*BlockingFunction)(); ConvertFunction mConvertFunc; -}; -void PTMFTester::convert() [[clang::nonblocking]] -{ - (this->*mConvertFunc)(); -} + void convert() [[clang::nonblocking]] + { + (this->*mConvertFunc)(); // This should not generate a warning. + } + + template <typename T> + struct Holder { + T value; + + T& operator*() { return value; } + }; + + + void ptmfInExpr(Holder<ConvertFunction>& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // Should not generate a warning. + ((*this).*(*holder))(); // Should not generate a warning. + } + + void ptmfInExpr(Holder<BlockingFunction>& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + ((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + } +}; // Allow implicit conversion from array to pointer. 
void nb14(unsigned idx) [[clang::nonblocking]] From c081fb058831cedce8466809e0d15daadd3ccad2 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:30:11 -0800 Subject: [PATCH 068/313] [RISCV] Removed unused OPERAND_SIMM8. NFC (#166215) --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..5b8cfb2100b26 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, From 0623497a0fcdb8cd32139c184c9f0d70dcd690f1 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:35:06 -0800 Subject: [PATCH 069/313] [RISCV] Mark FLH as canFoldAsLoad. (#165974) --- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 1 + llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll | 41 +++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll new file mode 100644 index 0000000000000..bf0a2e5e35a01 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s + +; CHECK-LABEL: .section .llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 0 +; Num Functions +; CHECK-NEXT: .word 1 +; Num LargeConstants +; CHECK-NEXT: .word 0 +; Num Callsites +; CHECK-NEXT: .word 1 + +; Functions and stack size +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 1 + +; Spilled stack map values. +; +; Verify 3 stack map entries. +; +; CHECK-LABEL: .word .L{{.*}}-liveArgs +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 25 +; +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... +; CHECK: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) { +entry: + call void (i64, i32, ptr, i32, ...) 
@llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) + ret void +} From ca00234c09e03bdb3471c83a24f1b8bc1fdb31f9 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:35:54 -0800 Subject: [PATCH 070/313] [RISCV] Correct comments in rv64-stackmap.ll to not use X86 register name. NFC (#165912) Note, X86 forces a frame pointer for stackmaps/patchpoint. So they use RBP where we use SP. --- llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c50a0fb3ffe91..320a3aa94cd7d 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -286,8 +286,8 @@ define void @liveConstant() { ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; -; Check that at least one is a spilled entry from RBP. -; Location: Indirect RBP + ... +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... ; CHECK: .byte 3 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -307,7 +307,7 @@ entry: ; CHECK-NEXT: .half 0 ; 1 location ; CHECK-NEXT: .half 1 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -320,14 +320,14 @@ entry: ; CHECK-NEXT: .half 0 ; 2 locations ; CHECK-NEXT: .half 2 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -; Loc 1: Direct RBP - ofs +; Loc 1: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 From 68c4c83bcbf9612a02074b946fe6bb73054183ef Mon Sep 17 00:00:00 2001 From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com> Date: Mon, 3 Nov 2025 21:48:27 +0100 Subject: [PATCH 071/313] [MLIR][XeGPU] Matrix load/store subgroup distribution (#165008) --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 4 +- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 35 ++-- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 2 +- .../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 2 + mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 162 ++++++++------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 38 +++- .../Transforms/XeGPUSubgroupDistribute.cpp | 186 +++++++++++++++++- .../Transforms/XeGPUWgToSgDistribute.cpp | 10 +- mlir/test/Dialect/XeGPU/invalid.mlir | 31 +-- .../Dialect/XeGPU/subgroup-distribute.mlir | 63 ++++++ .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 3 +- 11 files changed, 417 insertions(+), 119 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 1481859e94a92..0c059967bb898 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -30,9 +30,11 @@ class SliceAttr; } // namespace xegpu } // namespace mlir +// clang-format off +#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc> 
-#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> +// clang-format on #define GET_ATTRDEF_CLASSES #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc> diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 40352b44b6441..9c35c07a7e587 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -223,17 +223,17 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Derive a new layout by dropping InstData", "xegpu::DistributeLayoutAttr", "dropInstData">, - InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional - indices based on the effective subgroup layout.}], + InterfaceMethod<[{Delinearizes a linear ID into its multidimensional + indices based on the effective layout level.}], "FailureOr<SmallVector<Value>>", - "delinearizeSubgroupId", + "delinearizeId", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>, - InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks - assigned to a subgroup identified by linearId. The shape parameter - represents the workgroup-level problem size. Each subgroup may access + InterfaceMethod<[{Generates instructions to compute multidimensional coordinates for dist units + assigned to a level identified by linearId. The shape parameter + represents the higher-level problem size. Each level may access multiple blocks according to round-robin distribution rules.}], "FailureOr<SmallVector<SmallVector<Value>>>", - "getOffsets", + "computeDistributedCoords", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>, InterfaceMethod</*desc=*/[{Check if this layout can be achieved by applying a transpose to some other layout according to given permutation of (0...n-1).}], @@ -476,17 +476,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return {}; } - /// Delinearizes a linear subgroup ID into its multidimensional indices - /// based on the effective subgroup layout. + /// Delinearizes a linear ID into its multidimensional indices + /// based on the effective level of the layout. FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks - /// assigned to a subgroup identified by linearId. The shape parameter - /// represents the workgroup-level problem size. Each subgroup may access + /// Generates instructions to compute multidimensional coordinates for dist units + /// assigned to a level identified by linearId. The shape parameter + /// represents the higher-level problem size. Each `level` may access /// multiple blocks according to round-robin distribution rules. FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } @@ -643,14 +643,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. 
   FailureOr<SmallVector<Value>>
-  delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+  delinearizeId(OpBuilder &builder, Location loc, Value linearId);
 
-  /// Generates instructions to compute multidimensional offsets for blocks
+  /// Generates instructions to compute multidimensional coordinates for blocks
   /// assigned to a subgroup identified by linearId. The shape parameter
   /// represents the workgroup-level problem size. Each subgroup may access
   /// multiple blocks according to round-robin distribution rules.
+
   FailureOr<SmallVector<SmallVector<Value>>>
-  getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
+  computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
   /// Check if this is slice of some other layout.
   bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index b7af5413669c9..eb05628d4772b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -26,7 +26,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
     The pass distributes subgroup level (SIMD) XeGPU ops to work items.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
-                           "vector::VectorDialect"];
+                           "vector::VectorDialect", "index::IndexDialect"];
 }
 
 def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 33e8f2ed1f6ed..de552ce22ef62 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -562,6 +562,8 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
     VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType());
     if (!valOrResVecTy)
       valOrResVecTy = VectorType::get(1, data.getType());
+    if (valOrResVecTy.getShape().size() != 1)
+      return rewriter.notifyMatchFailure(op, "Expected 1D data vector.");
 
     int64_t elemBitWidth =
         valOrResVecTy.getElementType().getIntOrFloatBitWidth();
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 83406c8c75dcf..397107b786c9e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,55 +37,61 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-/// Generates instructions to compute offsets for a subgroup identified by
-/// its multidimensional indices (sgId), using the specified subgroup layout
-/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data
-/// dimensions (sizePerWg).
+// A `srcShape` consists of N distribution units, each being `subShapesLayout` x
+// `subShape`. A `delinearizedId` is used to identify a particular `subShape`
+// within each distribution unit.
+// Example:
+// WG data is 128x256 and SG data is 16x32 in a 4x2 layout. This gives a
+// distribution unit of shape 64x64, and there are 2x4 such distribution units.
+// `delinearizedId` identifies a subgroup's 16x32 tile within each
+// distribution unit.
static SmallVector<SmallVector<Value>> -genOffsetsComputingInsts(OpBuilder &builder, Location loc, - SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout, - ArrayRef<int64_t> sizePerSg, - ArrayRef<int64_t> sizePerWg) { - - SmallVector<SmallVector<Value>> offsets; +genCoordinates(OpBuilder &builder, Location loc, + SmallVector<Value> delinearizedId, + ArrayRef<int64_t> subShapesLayout, ArrayRef<int64_t> subShape, + ArrayRef<int64_t> srcShape) { + SmallVector<SmallVector<Value>> coordinates; + + // A distribution unit must be less than or equal to `srcShape` + SmallVector<int64_t> distUnitShape = llvm::map_to_vector( + llvm::zip_equal(srcShape, + computeElementwiseMul(subShapesLayout, subShape)), + [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i] - SmallVector<Value> localOffsets = llvm::map_to_vector( - llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value { + // Get the offset of `subShape` within a distribution unit. + SmallVector<Value> distUnitLocalOffset = llvm::map_to_vector( + llvm::zip(delinearizedId, subShape), [&](const auto &t) -> Value { return builder.createOrFold<index::MulOp>( loc, std::get<0>(t), builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t))); }); - // distUnit[i] is the minimum value between sizePerWg[i] and - // sgLayout[i] * sizePerSg[i] - SmallVector<int64_t> distUnit = llvm::map_to_vector( - llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)), - [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - + // For each dist unit for (SmallVector<int64_t> unitOffs : - StaticTileOffsetRange(sizePerWg, distUnit)) { + StaticTileOffsetRange(srcShape, distUnitShape)) { + // Get dist unit offset within `srcShape`. SmallVector<Value> base = llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value { return arith::ConstantIndexOp::create(builder, loc, d); }); - - SmallVector<Value> adds = llvm::map_to_vector( - llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value { - return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t), - std::get<1>(t)); - }); - + // Calculate `subShape` offset within `srcShape`. + SmallVector<Value> adds = + llvm::map_to_vector(llvm::zip_equal(base, distUnitLocalOffset), + [&](const auto &t) -> Value { + return builder.createOrFold<arith::AddIOp>( + loc, std::get<0>(t), std::get<1>(t)); + }); + // Do not go beyond `srcShape` bounds. 
SmallVector<Value> mods = llvm::map_to_vector( - llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value { + llvm::zip_equal(adds, srcShape), [&](const auto &t) -> Value { return builder.createOrFold<index::RemUOp>( loc, std::get<0>(t), arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); - offsets.push_back(mods); + coordinates.push_back(mods); } - return offsets; + return coordinates; } // Checks if the given shape can be evenly distributed based on the layout @@ -272,12 +278,7 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } FailureOr<SmallVector<Value>> -LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { - // delinearizeSubgroupId is only available for - // workgroup-level layout attribute - if (!isForWorkgroup()) - return failure(); +LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { // TODO: handle order attribute auto hasDefaultOrder = [&]() { @@ -287,41 +288,52 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, }; if (!hasDefaultOrder()) return mlir::emitError(loc, "order attribute is currently not supported."); - - auto dims = - llvm::map_to_vector(getEffectiveSgLayoutAsInt(), [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); + SmallVector<int64_t> layout; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + } else { + return failure(); + } + auto dims = llvm::map_to_vector(layout, [&](int64_t d) -> Value { + return builder.createOrFold<arith::ConstantIndexOp>(loc, d); + }); return affine::delinearizeIndex(builder, loc, linearId, dims); } -/// Implements DistributeLayoutAttr::getOffsets to generate +/// Implements DistributeLayoutAttr::computeDistributedCoords to generate /// instructions for computing multi-dimensional offsets when distributed by /// LayoutAttr. 
 FailureOr<SmallVector<SmallVector<Value>>>
-LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
-                       ArrayRef<int64_t> shape) {
-  if (!isForWorkgroup())
+LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
+                                     Value linearId, ArrayRef<int64_t> shape) {
+  SmallVector<int64_t> layout;
+  SmallVector<int64_t> subShape;
+  if (isForWorkgroup()) {
+    layout = getEffectiveSgLayoutAsInt();
+    subShape = getEffectiveSgDataAsInt();
+  } else if (isForSubgroup()) {
+    layout = getEffectiveLaneLayoutAsInt();
+    subShape = getEffectiveLaneDataAsInt();
+  } else {
     return failure();
-
-  SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt();
-  SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt();
-  if (sgShape.empty()) {
-    if (auto derivedShape = computeShapeRatio(shape, sgLayout))
-      sgShape = derivedShape.value();
+  }
+  if (subShape.empty()) {
+    if (auto derivedShape = computeShapeRatio(shape, layout))
+      subShape = derivedShape.value();
     else
       return failure();
   }
 
   // delinearize Ids
-  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  auto maybeIds = delinearizeId(builder, loc, linearId);
   if (failed(maybeIds))
     return failure();
-  SmallVector<Value> sgIds = *maybeIds;
+  SmallVector<Value> ids = *maybeIds;
 
-  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
-                                  shape);
+  return genCoordinates(builder, loc, ids, layout, subShape, shape);
 }
 
 //===----------------------------------------------------------------------===//
@@ -375,34 +387,43 @@ SliceAttr SliceAttr::flatten() const {
 }
 
 FailureOr<SmallVector<Value>>
-SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
-                                 Value linearId) {
+SliceAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) {
   SliceAttr attr = flatten();
   auto parent = dyn_cast<LayoutAttr>(attr.getParent());
-  return parent.delinearizeSubgroupId(builder, loc, linearId);
+  return parent.delinearizeId(builder, loc, linearId);
 }
 
-/// Implements DistributeLayoutAttr::getOffsets to generate
-/// instructions for computing multi-dimensional offsets when distributed by
-/// SliceAttr.
+// Implements DistributeLayoutAttr::computeDistributedCoords to generate
+// instructions for computing multi-dimensional offsets when distributed by
+// SliceAttr.
FailureOr<SmallVector<SmallVector<Value>>> -SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { +SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape."); if (!isForWorkgroup()) return failure(); - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { + return failure(); + } + + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); @@ -412,8 +433,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, SmallVector<Value> sgIds = XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims); - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, sgIds, layout, subShape, shape); } bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index abd12e2e69ac0..7b6c4b6c2c813 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -175,13 +175,13 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, LogicalResult IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, - UnitAttr subgroup_block_io, + UnitAttr subgroup_block_io, DistributeLayoutAttr layout, function_ref<InFlightDiagnostic()> emitError) { if (!dataTy) { if (subgroup_block_io) return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; + "are only allowed when result is a VectorType."; else return success(); } @@ -192,15 +192,37 @@ IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, ArrayRef<int64_t> dataShape = dataTy.getShape(); ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + ArrayAttr strideAttr = mdescTy.getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + if (subgroup_block_io && layout) { + auto laneData = layout.getEffectiveLaneDataAsInt(); + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (!laneData.empty()) { + bool isLaneDataContiguous = + std::all_of(laneData.begin(), std::prev(laneData.end()), + [](int x) { return x == 1; }); + if (!isLaneDataContiguous) + return emitError() << "With subgroup_block_io, accessed data must be " + "contiguous and coalesced."; + for (size_t i = 0; i < laneData.size(); ++i) { + if (laneLayout[i] != blockShape[i]) + return emitError() << "With subgroup_block_io, the block shape must " + "match the lane layout."; + if (laneLayout[i] != 1 && strides[i] != 1) + return emitError() << "With subgroup_block_io, 
the distributed " + "dimensions must be contiguous."; + } + } + } if (dataShape.size() == 2) { - if (subgroup_block_io) - return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), [](auto p) { return std::get<0>(p) > std::get<1>(p); })) return emitError() << "data shape must not exceed mem_desc shape."; } else { - SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); // if the subgroup_block_io attribute is set, mdescTy must have block // attribute if (subgroup_block_io && !blockShape.size()) @@ -1105,7 +1127,7 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1129,7 +1151,7 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 5a3b27ec6108e..bbd7733e89c29 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" @@ -912,6 +913,186 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { } }; +static SmallVector<Value> computeDistributedCoordinatesForMatrixOp( + PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout, + Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) { + SmallVector<Value> newCoods; + auto maybeCoords = + layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape); + if (failed(maybeCoords)) + return {}; + assert(maybeCoords.value().size() == 1 && + "Expected one set of distributed offsets"); + SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned( + rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]), + getAsOpFoldResult(origOffsets)); + newCoods = llvm::to_vector(llvm::map_range( + ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); })); + return newCoods; +} + +/// Pattern for distributing xegpu::LoadMatrixOp. 
+struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) { + return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op; + }); + if (!producedByLastLoad) + return rewriter.notifyMatchFailure( + warpOp, "The last op is not xegpu::LoadMatrixOp"); + const int operandIdx = producedByLastLoad->getOperandNumber(); + + VectorType sgPayloadTy = + dyn_cast<VectorType>(matrixOp.getResult().getType()); + VectorType warpResultTy = + cast<VectorType>(warpOp.getResult(operandIdx).getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the load op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create( + rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure, + newOperands[0], ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + // Resolve the output type and replace all uses. + rewriter.replaceAllUsesWith( + newWarpOp.getResult(operandIdx), + resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter)); + return success(); + } +}; + +/// Pattern for distributing xegpu::StoreMatrixOp. 
+struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the store op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + operandTypes[0] = *distPayloadByWarpOpOrFailure; + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + + xegpu::StoreMatrixOp::create( + rewriter, loc, TypeRange{}, newOperands[0], newOperands[1], + ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + rewriter.eraseOp(matrixOp); + return success(); + } +}; + /// Distribute a scattered load op. The logic and requirements are the same as /// for the scattered store distribution. The warpOp's payload vector is /// expected to be distributed by the load's result consumer. 
@@ -1443,7 +1624,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, GpuBarrierDistribution, VectorMultiReductionDistribution, LoadDistribution, StoreDistribution, VectorTransposeDistribution, - VectorBitcastDistribution, + VectorBitcastDistribution, LoadMatrixDistribution, + StoreMatrixDistribution, MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/regularPatternBenefit); @@ -1468,6 +1650,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Layouts are needed for vector type only. if (!isa<VectorType>(operand.get().getType())) continue; + if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op)) + continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9fc5ad9af5c7b..79eea55c8b78a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -114,7 +114,8 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory // descriptors to be accessed, based on the layout information. ArrayRef<int64_t> wgShape = op.getDataShape(); - auto maybeDescOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeDescOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeDescOffsets)) return failure(); @@ -830,8 +831,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> { // Get subgroup id Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -1052,7 +1053,8 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index ebbe3ce0ec0d0..92f353717ac59 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -451,7 +451,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{Mask should match value except the chunk size dim}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1> return } @@ -870,14 +870,6 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { return } -// ----- -func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16> - return -} - - // ----- func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { // expected-error@+1 {{failed 
to verify that all of {mem_desc, data} have same element type}} @@ -900,16 +892,25 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) { + // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : + vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>> return } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>> return } +// ----- +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>> + return +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 27a3dc373c739..8946d14e80b72 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -265,3 +265,66 @@ gpu.module @xevm_module{ gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { +// CHECK: %[[LAYOUT_X:.*]] = arith.constant 8 : index +// CHECK: %[[LAYOUT_Y:.*]] = arith.constant 2 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_Y]], %[[LAYOUT_Y]] +// CHECK: %[[LANE_X_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[LAYOUT_X]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = 
arith.constant 0 : index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { +// CHECK: %[[DIST_UNIT_HEIGHT_X:.*]] = arith.constant 4 : index +// CHECK: %[[DIST_UNIT_HEIGHT_Y:.*]] = arith.constant 8 : index +// CHECK: %[[LANE_DATA_Y:.*]] = arith.constant 2 : index +// CHECK: %[[USER_OFFSET_X:.*]] = arith.constant 1 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[LANE_Y_OFFSET_1:.*]] = index.mul %[[DELINEARIZED_LANE_Y]], %[[LANE_DATA_Y]] +// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[LANE_Y_OFFSET_1]], %[[DIST_UNIT_HEIGHT_Y]] +// CHECK: %[[LANE_X_OFFSET_1:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[DIST_UNIT_HEIGHT_X]] +// CHECK: %[[LANE_X_OFFSET:.*]] = index.add %[[LANE_X_OFFSET_1]], %[[USER_OFFSET_X]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) { +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index + gpu.return + } +} diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 76d461108b296..93d51441f5b81 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ 
b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -200,7 +200,8 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeOffsets = + sliceAttr.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeOffsets)) return failure(); From 346da3dfd3e70ca82a7df968c79666ff0b8c77c5 Mon Sep 17 00:00:00 2001 From: Kewen Meng <Kewen.Meng@amd.com> Date: Mon, 3 Nov 2025 12:50:11 -0800 Subject: [PATCH 072/313] Revert "[libc] Add printf error handling" (#166232) --- libc/src/stdio/CMakeLists.txt | 24 --------- libc/src/stdio/asprintf.cpp | 18 +------ libc/src/stdio/baremetal/CMakeLists.txt | 8 --- libc/src/stdio/baremetal/printf.cpp | 23 ++------ libc/src/stdio/baremetal/vprintf.cpp | 23 ++------ libc/src/stdio/generic/CMakeLists.txt | 4 -- libc/src/stdio/generic/fprintf.cpp | 17 +----- libc/src/stdio/generic/printf.cpp | 17 +----- libc/src/stdio/generic/vfprintf.cpp | 17 +----- libc/src/stdio/generic/vprintf.cpp | 17 +----- libc/src/stdio/printf_core/CMakeLists.txt | 25 --------- libc/src/stdio/printf_core/core_structs.h | 19 +++---- libc/src/stdio/printf_core/error_mapper.h | 21 -------- .../stdio/printf_core/generic/CMakeLists.txt | 8 --- .../stdio/printf_core/generic/error_mapper.h | 49 ----------------- .../stdio/printf_core/linux/CMakeLists.txt | 8 --- .../stdio/printf_core/linux/error_mapper.h | 54 ------------------- libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 ++++--- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++--------- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 +------ libc/src/stdio/sprintf.cpp | 18 +------ libc/src/stdio/vasprintf.cpp | 16 +----- libc/src/stdio/vsnprintf.cpp | 19 +------ libc/src/stdio/vsprintf.cpp | 17 +----- libc/src/stdlib/CMakeLists.txt | 6 --- libc/src/stdlib/strfromd.cpp | 10 +--- libc/src/stdlib/strfromf.cpp | 10 +--- libc/src/stdlib/strfroml.cpp | 10 +--- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 - libc/test/src/stdio/fprintf_test.cpp | 23 -------- .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ------ libc/test/src/stdio/vfprintf_test.cpp | 5 -- libc/test/src/stdlib/StrfromTest.h | 19 +------ 39 files changed, 104 insertions(+), 584 deletions(-) delete mode 100644 libc/src/stdio/printf_core/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index c75c8b11be2b5..b0a6ef1e291b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,10 +125,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -140,10 +136,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main 
libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -154,10 +146,6 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -169,10 +157,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -184,10 +168,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -198,10 +178,6 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 083f40c1f19fa..f8cfb74ce48ea 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,12 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -26,18 +22,8 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vasprintf_internal(buffer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret = printf_core::vasprintf_internal(buffer, format, args); + return ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index bfeff0e2b5880..548938f885c94 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,12 +29,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -91,12 +87,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 0c6c9ad338c9f..7253c6549a4e4 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -45,25 +42,13 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index d2f586c70ad1c..ab02533f14911 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include 
"src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -43,25 +40,13 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 71055edea3d9e..6361822b61999 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,11 +393,7 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list - libc.src.__support.CPP.limits - libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index b07f2528fe11d..087aeadfc52c5 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -30,18 +27,8 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index d6d4adcefb3b1..bb7c7c86f843f 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -34,19 +31,9 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index c00352d1dd666..01f4265f118a6 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vfprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,18 +24,8 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 1c0837fd5d441..08d71515646ed 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -32,19 +29,9 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 2d1daea71406e..ee66145e60156 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,17 +32,6 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) - add_subdirectory(${LIBC_TARGET_OS}) -else() - add_subdirectory(generic) -endif() - -set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) -if(NOT TARGET ${target_error_converter}) - set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) -endif() - add_header_library( printf_config HDRS @@ -58,7 +47,6 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits - libc.hdr.errno_macros ) add_header_library( @@ -137,7 +125,6 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list - libc.src.__support.error_or ) add_header_library( @@ -149,20 +136,10 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list - libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - ${target_error_mapper} - libc.src.__support.macros.properties.architectures -) - if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -175,10 +152,8 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file - libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) - diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 0d41f2244d8da..e27f77b6b594a 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,17 +132,14 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the error return values used by the printf engine when an -// error has occurred. They are all large negative, distinct values starting -// from -1000 to not overlap with system errors. -constexpr int FILE_WRITE_ERROR = -1001; -constexpr int FILE_STATUS_ERROR = -1002; -constexpr int NULLPTR_WRITE_ERROR = -1003; -constexpr int INT_CONVERSION_ERROR = -1004; -constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; -constexpr int ALLOCATION_ERROR = -1006; -constexpr int OVERFLOW_ERROR = -1007; - +// These are the printf return values for when an error has occurred. They are +// all negative, and should be distinct. +constexpr int FILE_WRITE_ERROR = -1; +constexpr int FILE_STATUS_ERROR = -2; +constexpr int NULLPTR_WRITE_ERROR = -3; +constexpr int INT_CONVERSION_ERROR = -4; +constexpr int FIXED_POINT_CONVERSION_ERROR = -5; +constexpr int ALLOCATION_ERROR = -6; } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h deleted file mode 100644 index 23030930133a1..0000000000000 --- a/libc/src/stdio/printf_core/error_mapper.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Error mapper for printf ---------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H - -#include "src/__support/macros/properties/architectures.h" - -// Maps internal errors to the available errnos on the platform. -#if defined(__linux__) -#include "linux/error_mapper.h" -#else -#include "generic/error_mapper.h" -#endif - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/generic/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h deleted file mode 100644 index d8cdd2cc2dbaa..0000000000000 --- a/libc/src/stdio/printf_core/generic/error_mapper.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Generic implementation of error mapper ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to the available C standard errnos. - switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - case FILE_STATUS_ERROR: - case NULLPTR_WRITE_ERROR: - case ALLOCATION_ERROR: - return EDOM; - case INT_CONVERSION_ERROR: - case FIXED_POINT_CONVERSION_ERROR: - case OVERFLOW_ERROR: - return ERANGE; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EDOM; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/linux/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h deleted file mode 100644 index 3c2fe663072d0..0000000000000 --- a/libc/src/stdio/printf_core/linux/error_mapper.h +++ /dev/null @@ -1,54 +0,0 @@ -//===-- Linux implementation of error mapper --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to POSIX errnos. 
- switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - return EIO; - case FILE_STATUS_ERROR: - return EIO; - case NULLPTR_WRITE_ERROR: - return EINVAL; - case INT_CONVERSION_ERROR: - return ERANGE; - case FIXED_POINT_CONVERSION_ERROR: - return EINVAL; - case ALLOCATION_ERROR: - return ENOMEM; - case OVERFLOW_ERROR: - return EOVERFLOW; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EINVAL; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 1c7a7237c097d..57f29858d5298 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -23,9 +22,8 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -ErrorOr<size_t> printf_main(Writer<write_mode> *writer, - const char *__restrict str, - internal::ArgList &args) { +int printf_main(Writer<write_mode> *writer, const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -35,8 +33,9 @@ ErrorOr<size_t> printf_main(Writer<write_mode> *writer, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); + if (result < 0) - return Error(-result); + return result; } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 41df17b67f35b..283d8df2810fb 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,7 +10,6 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,7 +29,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return ALLOCATION_ERROR; + return printf_core::ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -43,28 +42,27 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, - const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { + if (ret_val < 0) { *ret = nullptr; - return ret_val; + return -1; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char *>(malloc(ret_val.value() + 
1)); + *ret = static_cast<char *>(malloc(ret_val + 1)); if (ret == nullptr) - return Error(ALLOCATION_ERROR); - inline_memcpy(*ret, wb.buff, ret_val.value()); + return printf_core::ALLOCATION_ERROR; + inline_memcpy(*ret, wb.buff, ret_val); } else { *ret = wb.buff; } - (*ret)[ret_val.value()] = '\0'; + (*ret)[ret_val] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..630de9d9d43dd 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,7 +11,6 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -36,8 +35,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, FILE *f) { +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -48,11 +47,9 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, ::FILE *f) { - // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + ::FILE *f) { + return ::fwrite_unlocked(ptr, size, nmemb, f); } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -63,38 +60,26 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - // Propagate actual system error in FileIOResult. - if (write_result.has_error()) - return -write_result.error; - - // In case short write occured or error was not set on FileIOResult for some - // reason. 
- if (write_result.value != new_str.size() || - internal::ferror_unlocked(target_file)) + size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + if (written != new_str.size() || internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; - return WRITE_OK; } -LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - auto retval = printf_main(&writer, format, args); - if (!retval.has_value()) { - internal::funlockfile(stream); - return retval; - } + int retval = printf_main(&writer, format, args); int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = Error(-flushval); + retval = flushval; internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 04b2bef05bc7b..efcff278bd284 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - size_t written = writer->get_chars_written(); + int written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 9de108ece510f..1d4734a51b9b8 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - size_t chars_written = 0; + int chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += new_string.size(); + chars_written += static_cast<int>(new_string.size()); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += length; + chars_written += static_cast<int>(length); if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE size_t get_chars_written() { return chars_written; } + LIBC_INLINE int get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index 206c50d1b41a4..c8940862f711f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,21 +32,10 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 9e9ecdfdf38cc..7be97d3591aaf 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,20 +33,9 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4bc6a5992d5c8..4a44d4a0f8842 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,11 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" -#include "src/stdio/printf_core/core_structs.h" 
-#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,17 +18,7 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vasprintf_internal(ret, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return printf_core::vasprintf_internal(ret, format, args); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index ba3568fbe78b3..b07a2499a0dd3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/vsnprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,21 +29,10 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 65c223a442d4b..26d497be42125 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,19 +30,9 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 1ccdcc8bec148..c464f82dcbda7 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,8 +73,6 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -85,8 +83,6 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -97,8 +93,6 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f970e22010201..f51e6d4c7f1df 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,12 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 55ede003134b5..14dbfdb25bab6 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" -#include "src/__support/CPP/limits.h" #include 
"src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,12 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 37d18738902bc..12f22a8a2fb65 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -44,12 +41,7 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..c7e590627094a 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,8 +36,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - // TODO: Use ErrorOr<size_t> - return static_cast<int>(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index d71f1dff11943..eec108bc12ca5 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,8 +186,6 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 7d36bd30854b8..6799323cc6ad9 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,9 +15,6 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -34,8 +31,6 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test -using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -83,24 +78,6 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); - - ASSERT_EQ(printf_test::fclose(file), 0); -} - -#ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS 
-TEST(LlvmLibcFPrintfTest, NullPtrCheck) { - const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); - auto FILE_PATH = libc_make_test_file_path(FILENAME); - - ::FILE *file = printf_test::fopen(FILE_PATH, "w"); - ASSERT_FALSE(file == nullptr); - - int ret = - LIBC_NAMESPACE::fprintf(file, "hello %s", static_cast<int *>(nullptr)); - EXPECT_LT(ret, 0); - ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } -#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index 2dae2a22c864c..bf088937e4104 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), size_t{2}); + ASSERT_EQ(writer.get_chars_written(), 2); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { 
wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), size_t{5}); + ASSERT_EQ(writer.get_chars_written(), 5); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{18}); + ASSERT_EQ(writer.get_chars_written(), 18); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), size_t{6}); + ASSERT_EQ(writer.get_chars_written(), 6); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{10}); + ASSERT_EQ(writer.get_chars_written(), 10); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d263cf55aa474..d036341be7981 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), size_t{99}); + ASSERT_EQ(writer.get_chars_written(), 99); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, 
WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index 1062f952d7429..baaa664cdc9ee 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. 
@@ -63,14 +59,3 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } - -TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { - char buff[0]; - - // Trigger an overflow in the return value of snprintf by writing more than - // INT_MAX bytes. - int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); - int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); - EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EOVERFLOW); -} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index 9b5f09db8fd41..f50565a0f68ca 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,8 +19,6 @@ #include "src/stdio/vfprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -46,8 +44,6 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } -using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -94,7 +90,6 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index fdeed0e3c06f5..e82c94499aa11 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,8 +8,6 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -17,7 +15,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class StrfromTest : public LIBC_NAMESPACE::testing::Test { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -483,16 +481,6 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } - - void charsWrittenOverflow(FunctionT func) { - char buff[100]; - // Trigger an overflow in the return value of strfrom by writing more than - // INT_MAX bytes. 
- int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); - - EXPECT_LT(result, 0); - ASSERT_ERRNO_EQ(EOVERFLOW); - } }; #define STRFROM_TEST(InputType, name, func) \ @@ -513,7 +501,4 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ - TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ - charsWrittenOverflow(func); \ - } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } From 96f093770d21b8bbe05a6b8ff744ce9fdd6e4c50 Mon Sep 17 00:00:00 2001 From: Roy Shi <royitaqi@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:04:34 -0800 Subject: [PATCH 073/313] [dsymutil] Add option to copy swiftmodules built from interface (#165293) The default behavior is to _not_ copy such swiftmodules into the dSYM, as previously implemented in 96f95c9d89d8a1784d3865fa941fb1c510f4e2d7. This patch adds the option to override the behavior, so that such swiftmodules can be copied into the dSYM. This is useful when the dSYM will be used on a machine which has a different Xcode/SDK than where the swiftmodules were built. Without this, when LLDB is asked to "p/po" a Swift variable, the underlying Swift compiler code would rebuild the dependent `.swiftmodule` files of the Swift stdlibs, which takes ~1 minute in some cases. See PR for tests. --- llvm/docs/CommandGuide/dsymutil.rst | 8 +++++ .../swiftmodule-include-from-interface.test | 33 +++++++++++++++++++ llvm/test/tools/dsymutil/cmdline.test | 1 + llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 7 ++-- llvm/tools/dsymutil/LinkUtils.h | 7 ++++ llvm/tools/dsymutil/Options.td | 8 +++++ llvm/tools/dsymutil/dsymutil.cpp | 3 ++ 7 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + + ..
option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..ee1e9060657b0 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -794,9 +794,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. 
These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. Defaults to 'classic'">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 913077eb0b06d..688f6aaf3d0c9 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) { Options.LinkOpts.RemarksKeepAll = !Args.hasArg(OPT_remarks_drop_without_debug); + Options.LinkOpts.IncludeSwiftModulesFromInterface = + Args.hasArg(OPT_include_swiftmodules_from_interface); + if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix)) Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue(); From 9ff31be2f2c181d81bbafa927e6aca117fd68330 Mon Sep 17 00:00:00 2001 From: Michael Spencer <bigcheesegs@gmail.com> Date: Mon, 3 Nov 2025 13:12:55 -0800 Subject: [PATCH 074/313] [clang][builtins] Add stdckdint.h to the modulemap. (#166230) All builtin Clang headers need to be covered by the modulemap. 
This fixes https://github.com/llvm/llvm-project/issues/166173 --- clang/lib/Headers/module.modulemap | 5 +++++ clang/lib/Lex/ModuleMap.cpp | 1 + .../Modules/Inputs/builtin-headers/system-modules.modulemap | 5 +++++ clang/test/Modules/builtin-headers.mm | 1 + 4 files changed, 12 insertions(+) diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 2e4d533356569..c13dd3fd48ac8 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] { export * } +module _Builtin_stdckdint [system] { + header "stdckdint.h" + export * +} + module _Builtin_stdcountof [system] { header "stdcountof.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 637a08fe4dcdb..b8202ea11be36 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) { .Case("stdarg.h", true) .Case("stdatomic.h", true) .Case("stdbool.h", true) + .Case("stdckdint.h", true) .Case("stdcountof.h", true) .Case("stddef.h", true) .Case("stdint.h", true) diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap index 186965177caaf..8ab6ae4779ea9 100644 --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] { export * } + module stdckdint { + header "stdckdint.h" + export * + } + module stdcountof { header "stdcountof.h" export * diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm index ad2d66ae38dfd..6cd366228172e 100644 --- a/clang/test/Modules/builtin-headers.mm +++ b/clang/test/Modules/builtin-headers.mm @@ -17,6 +17,7 @@ @import _Builtin_stdarg; @import _Builtin_stdatomic; @import _Builtin_stdbool; +@import _Builtin_stdckdint; @import _Builtin_stdcountof; @import _Builtin_stddef; @import _Builtin_stdint; From e5d9644bca214587a22401693eee90b2934d5827 Mon Sep 17 00:00:00 2001 From: "Deric C." <cheung.deric@gmail.com> Date: Mon, 3 Nov 2025 13:13:11 -0800 Subject: [PATCH 075/313] [NFC] [DirectX] Fix warning about parentheses for assertion in DXContainerGlobals.cpp (#166231) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes the appearance of the following warning message when building LLVM with clang (21.1.2) ``` [48/100] Building CXX object lib/Target/DirectX/CMakeFiles/LLVMDirectXCodeGen.dir/DXContainerGlobals.cpp.o In file included from /nix/store/ffrg0560kj0066s4k9pznjand907nlnz-gcc-14.3.0/include/c++/14.3.0/cassert:44, from /workspace/llvm-project/llvm/include/llvm/Support/Endian.h:19, from /workspace/llvm-project/llvm/include/llvm/Support/MD5.h:33, from /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp:28: /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp: In lambda function: /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp:198:78: warning: suggest parentheses around ‘&&’ within ‘||’ [-Wparentheses] 198 | (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~ 199 | "Resource range is too large"); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` I marked this PR as an NFC because it only modifies an assertion condition to remove a compiler warning. 
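For context, here is a minimal sketch (not from the patch) of why the old form still behaved correctly and the warning is purely about readability: `&&` binds tighter than `||`, and the string literal used as an assert message is always a non-null pointer, so `cond && "msg"` has the same truth value as `cond`:

```c++
#include <cassert>

void check(bool a, bool b) {
  // Parsed as: a || (b && "msg"); since "msg" is a non-null pointer,
  // (b && "msg") == b, which is why this change is NFC.
  // assert(a || b && "msg");   // triggers -Wparentheses
  assert((a || b) && "msg"); // intended grouping, warning-free
}
```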
--- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..eb4c8846441a2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -194,9 +194,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; From 25da15f9b1f910cb5d350381c1c8a287cbe4cf0f Mon Sep 17 00:00:00 2001 From: Julian Lettner <yln@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:29:34 -0800 Subject: [PATCH 076/313] [lldb] Fix indentation when printing stop hooks (#165945) This commit aggregates the following changes: 1. Fix the format (i.e., indentation) when printing stop hooks via `target stop-hook list`. 2. Add `IndentScope Stream::MakeIndentScope()` to make managing (and restoring!) the indentation level on `Stream` instances more ergonomic and less error-prone. 3. Simplify printing of stop hooks using the new `IndentScope`. --- lldb/include/lldb/Utility/Stream.h | 26 ++++++++++---- lldb/source/Target/Target.cpp | 19 ++++------ lldb/source/Utility/Stream.cpp | 8 +++++ .../StopHook/stop-hook-list-format.test | 36 +++++++++++++++++++ 4 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. /// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream.
/// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 1e43094421f0a..a23091ad09c6d 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3962,9 +3962,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3978,19 +3976,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4003,14 +3999,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4145,7 +4140,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4155,8 +4150,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). +Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test new file mode 100644 index 0000000000000..a9557801cc134 --- /dev/null +++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test @@ -0,0 +1,36 @@ +# Test format (e.g., indentation) when printing the list of stop hooks. 
+# +# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace + +# Create some stop hooks +target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true +target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file' +target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread' + +# Print hooks +target stop-hook list + +# CHECK:(lldb) target stop-hook list +# CHECK:Hook: 1 +# CHECK: State: enabled +# CHECK: AutoContinue on +# CHECK: Commands: +# CHECK: print "Hello" +# CHECK-EMPTY: +# CHECK:Hook: 2 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: File: my_file. +# CHECK: Commands: +# CHECK: print "world," +# CHECK: print "nice" +# CHECK-EMPTY: +# CHECK:Hook: 3 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: Class name: MyClass. +# CHECK: Thread: +# CHECK: thread name: "my_thread" +# CHECK: Commands: +# CHECK: print "weather!" +# CHECK-EMPTY: From 6fe3eccdf44fc8adb46e78e65edadd57926d2fb6 Mon Sep 17 00:00:00 2001 From: Laxman Sole <lsole@nvidia.com> Date: Mon, 3 Nov 2025 13:34:44 -0800 Subject: [PATCH 077/313] [llvm][DebugInfo] Emit 0/1 for constant boolean values (#151225) Previously, sign-extending a 1-bit boolean operand in `#DBG_VALUE` would convert `true` to -1 (i.e., 0xffffffffffffffff). However, DWARF treats booleans as unsigned values, so this resulted in the attribute `DW_AT_const_value(0xffffffffffffffff)` being emitted. As a result, the debugger would display the value as `255` instead of `true`. This change modifies the behavior to use zero-extension for 1-bit values instead, ensuring that `true` is represented as 1. Consequently, the DWARF attribute emitted is now `DW_AT_const_value(1)`, which allows the debugger to correctly display the boolean as `true`. 
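For reference, a minimal sketch (not part of the patch) of the two extension behaviors on a 1-bit `APInt`, which is where the old `255`/`-1` artifact came from:

```c++
#include "llvm/ADT/APInt.h"

void demo() {
  llvm::APInt True(/*numBits=*/1, /*val=*/1); // an i1 holding 'true'
  (void)True.getSExtValue(); // -1: the single set bit is sign-extended
  (void)True.getZExtValue(); // 1: what DWARF expects for an unsigned boolean
}
```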
--- .../CodeGen/GlobalISel/MachineIRBuilder.cpp | 4 ++- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 ++ llvm/lib/Transforms/Utils/Local.cpp | 6 +++- llvm/test/DebugInfo/debug-bool-const-value.ll | 29 +++++++++++++++++++ llvm/unittests/Transforms/Utils/LocalTest.cpp | 2 +- 5 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 llvm/test/DebugInfo/debug-bool-const-value.ll diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..d84c3fb05bb24 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -733,6 +733,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? 
DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 4b53cc3d6e516..4908eda16e002 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1153,7 +1153,7 @@ TEST(Local, ExpressionForConstant) { IntegerType *Int1Ty = Type::getInt1Ty(Context); Expr = createExpression(ConstantInt::getTrue(Context), Int1Ty); EXPECT_NE(Expr, nullptr); - EXPECT_EQ(Expr->getElement(1), 18446744073709551615U); + EXPECT_EQ(Expr->getElement(1), 1U); Expr = createExpression(ConstantInt::getFalse(Context), Int1Ty); EXPECT_NE(Expr, nullptr); From 1621486d676f0ebfd6e64b6e93e31dcffa27c5ec Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere <jonas@devlieghere.com> Date: Mon, 3 Nov 2025 13:37:39 -0800 Subject: [PATCH 078/313] [lldb-dap] Add support for launching supported clients (#165941) Support launching a supported DAP client using the lldb-dap binary. Currently, only the official LLDB-DAP Visual Studio Code extension is supported. It uses the VS Code launch URL format. 
Here's an example: ``` lldb-dap --client vscode -- /path/to/exe foo bar ``` This will open the following URL with `code --open-url`: ``` vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Fexe&args=foo&args=bar ``` Fixes #125777 --- lldb/test/Shell/DAP/TestClientLauncher.test | 2 + lldb/tools/lldb-dap/CMakeLists.txt | 1 + lldb/tools/lldb-dap/ClientLauncher.cpp | 74 +++++++++++++++++++++ lldb/tools/lldb-dap/ClientLauncher.h | 50 ++++++++++++++ lldb/tools/lldb-dap/tool/Options.td | 8 +++ lldb/tools/lldb-dap/tool/lldb-dap.cpp | 38 +++++++++++ lldb/unittests/DAP/CMakeLists.txt | 1 + lldb/unittests/DAP/ClientLauncherTest.cpp | 71 ++++++++++++++++++++ 8 files changed, 245 insertions(+) create mode 100644 lldb/test/Shell/DAP/TestClientLauncher.test create mode 100644 lldb/tools/lldb-dap/ClientLauncher.cpp create mode 100644 lldb/tools/lldb-dap/ClientLauncher.h create mode 100644 lldb/unittests/DAP/ClientLauncherTest.cpp diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index dd1bbbdddfc59..fa940b7b73943 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS Support) add_lldb_library(lldbDAP Breakpoint.cpp BreakpointBase.cpp + ClientLauncher.cpp CommandPlugins.cpp DAP.cpp DAPError.cpp diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp new file mode 100644 index 0000000000000..4cac1d6346441 --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace lldb_dap; + +std::optional<ClientLauncher::Client> +ClientLauncher::GetClientFrom(llvm::StringRef str) { + return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower()) + .Case("vscode", ClientLauncher::VSCode) + .Case("vscode-url", ClientLauncher::VSCodeURL) + .Default(std::nullopt); +} + +std::unique_ptr<ClientLauncher> +ClientLauncher::GetLauncher(ClientLauncher::Client client) { + switch (client) { + case ClientLauncher::VSCode: + return std::make_unique<VSCodeLauncher>(); + case ClientLauncher::VSCodeURL: + return std::make_unique<VSCodeURLPrinter>(); + } + return nullptr; +} + +std::string VSCodeLauncher::URLEncode(llvm::StringRef str) { + std::string out; + llvm::raw_string_ostream os(out); + for (char c : str) { + if (std::isalnum(c) || llvm::StringRef("-_.~").contains(c)) + os << c; + else + os << '%' << llvm::utohexstr(c, false, 2); + } + return os.str(); +} + +std::string +VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const { + assert(!args.empty() && "empty launch args"); + + std::vector<std::string> encoded_launch_args; + for (llvm::StringRef arg : args) + encoded_launch_args.push_back(URLEncode(arg)); + + const std::string args_str = llvm::join(encoded_launch_args, "&args="); + return llvm::formatv( + "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}", + args_str) + .str(); +} + +llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) { + const std::string launch_url = GetLaunchURL(args); + const std::string command = + llvm::formatv("code --open-url {0}", launch_url).str(); + + std::system(command.c_str()); + return llvm::Error::success(); +} + +llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) { + llvm::outs() << GetLaunchURL(args) << '\n'; + return llvm::Error::success(); +} diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h new file mode 100644 index 0000000000000..780b178d2d6ef --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.h @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H +#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include <vector> + +namespace lldb_dap { + +class ClientLauncher { +public: + enum Client { + VSCode, + VSCodeURL, + }; + + virtual ~ClientLauncher() = default; + virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0; + + static std::optional<Client> GetClientFrom(llvm::StringRef str); + static std::unique_ptr<ClientLauncher> GetLauncher(Client client); +}; + +class VSCodeLauncher : public ClientLauncher { +public: + using ClientLauncher::ClientLauncher; + + llvm::Error Launch(const std::vector<llvm::StringRef> args) override; + + std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const; + static std::string URLEncode(llvm::StringRef str); +}; + +class VSCodeURLPrinter : public VSCodeLauncher { + using VSCodeLauncher::VSCodeLauncher; + + llvm::Error Launch(const std::vector<llvm::StringRef> args) override; +}; + +} // namespace lldb_dap + +#endif diff --git a/lldb/tools/lldb-dap/tool/Options.td b/lldb/tools/lldb-dap/tool/Options.td index 5e9dd7a1d6419..339a64fed6c32 100644 --- a/lldb/tools/lldb-dap/tool/Options.td +++ b/lldb/tools/lldb-dap/tool/Options.td @@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">, "timeout is reached, the server will be closed and the process will exit. " "Not specifying this argument or specifying non-positive values will " "cause the server to wait for new connections indefinitely.">; + +def client + : S<"client">, + MetaVarName<"<client>">, + HelpText< + "Use lldb-dap as a launcher for a curated number of DAP client.">; + +def REM : R<["--"], "">; diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp index 45caa1a81059b..f10ed12344cbd 100644 --- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "ClientLauncher.h" #include "DAP.h" #include "DAPLog.h" #include "EventHelper.h" @@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) { debugger to attach to the process. lldb-dap -g + + You can also use lldb-dap to launch a supported client, for example the + LLDB-DAP Visual Studio Code extension. 
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..b1fdef18fddba 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} From 5e8a0d64952f374ea0194d3d2876d1deeb8c5320 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 13:37:46 -0800 Subject: [PATCH 079/313] [RISCV] Use reportFatalUsageError in a few places (#166218) Makes it consistent with feedback given in the equivalent GISel code. 
https://github.com/llvm/llvm-project/pull/165876 --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..907833513c5d1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2223,8 +2223,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2457,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, From bc0d0cf3ac9b5f1c2ec33d22aba8e7ece7d08ed2 Mon Sep 17 00:00:00 2001 From: Doug Wyatt <doug@sonosphere.com> Date: Mon, 3 Nov 2025 13:39:56 -0800 Subject: [PATCH 080/313] [Clang] FunctionEffect analysis was missing a CXXBindTemporaryExpr's implicit call to a destructor. (#166110) This example is reduced from a discovery: resetting a shared pointer from a nonblocking function is not diagnosed. ``` void nb23() { struct X { int *ptr = nullptr; X() {} ~X() { delete ptr; } }; auto inner = []() [[clang::nonblocking]] { X(); }; } ``` `shared_ptr<T>::reset()` creates a temporary `shared_ptr` and swaps it with its current state. The temporary `shared_ptr` constructor is nonblocking but its destructor potentially deallocates memory and is unsafe. 
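As a minimal sketch of that pattern (hypothetical function, not from the patch; `reset()` internally swaps in a default-constructed temporary whose destructor releases the old state):

```c++
#include <memory>

void audio_callback(std::shared_ptr<int> &state) [[clang::nonblocking]] {
  // The temporary shared_ptr inside reset() has a nonblocking-safe
  // constructor, but its destructor may deallocate memory; that implicit
  // destructor call is the one the analysis was missing.
  state.reset();
}
```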
Analysis was ignoring the implicit call in the AST to destroy the temporary. --------- Co-authored-by: Doug Wyatt <dwyatt@apple.com> --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaFunctionEffects.cpp | 8 ++++++ .../Sema/attr-nonblocking-constraints.cpp | 27 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cd272396252d0..db695d86b5416 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -460,6 +460,7 @@ Bug Fixes to Attribute Support - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 5459861ec349d..4b63eb7df1054 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1279,7 +1279,15 @@ class Analyzer { const CXXConstructorDecl *Ctor = Construct->getConstructor(); CallableInfo CI(*Ctor); followCall(CI, Construct->getLocation()); + return true; + } + bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override { + const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor(); + if (Dtor != nullptr) { + CallableInfo CI(*Dtor); + followCall(CI, BTE->getBeginLoc()); + } return true; } diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index 0d2dbb4947dc8..881e816292d59 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -373,6 +373,33 @@ struct Unsafe { Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} }; +// Exercise cases of a temporary with a safe constructor and unsafe destructor. 
+void nb23() +{ + struct X { + int *ptr = nullptr; + X() {} + ~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}} + }; + + auto inner = []() [[clang::nonblocking]] { + X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + + auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + +} + +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}} +void nb24() { + S2 s; + [&]() [[clang::nonblocking]] { + [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + }(); +} + struct DerivedFromUnsafe : public Unsafe { DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} From 1d8d8dc6b53aea14d5a60169e426416af7cc273f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Mon, 3 Nov 2025 21:43:11 +0000 Subject: [PATCH 081/313] [gn build] Port 1621486d676f --- llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index a42f781271a21..b6c2f465a7292 100644 --- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -21,6 +21,7 @@ static_library("lib") { sources = [ "Breakpoint.cpp", "BreakpointBase.cpp", + "ClientLauncher.cpp", "CommandPlugins.cpp", "DAP.cpp", "DAPError.cpp", From a8de6499c240abbcb2c6b51e6b59cc048a7533c2 Mon Sep 17 00:00:00 2001 From: Dan Liew <dan@su-root.co.uk> Date: Mon, 3 Nov 2025 13:54:22 -0800 Subject: [PATCH 082/313] [NFC][LLDB][BoundsSafety] Add `InstrumentationRuntime::MatchAllModules` (#166001) This adds a virtual method that allows `InstrumentationRuntime` subclasses to match against all modules rather than just a library that matches a particular regex. When the implementation returns true, `GetPatternForRuntimeLibrary()` is ignored and all modules are iterated over. The default implementation returns false, preserving the previous behavior, which uses `GetPatternForRuntimeLibrary()` to match only a particular runtime library. The intended use case here is for implementing an `InstrumentationRuntime` where the runtime library of interest can have multiple implementations and whose name is not known ahead of time. The concrete use case here is for an `InstrumentationRuntime` plugin for implementations of the `-fbounds-safety` soft-trap runtime which can have multiple different implementations and so the module containing the runtime functions isn't known ahead of time.
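As a rough sketch (hypothetical subclass and helper, not part of this patch; other required overrides such as `GetPatternForRuntimeLibrary()` and `Activate()` are omitted), such a plugin would override the new hook and do its real detection in `CheckIfRuntimeIsValid`:

```c++
#include "lldb/Target/InstrumentationRuntime.h"

// Hypothetical helper that inspects a module for the runtime's symbols.
bool ContainsSoftTrapRuntimeSymbols(const lldb::ModuleSP &module_sp);

class BoundsSafetyRuntime : public lldb_private::InstrumentationRuntime {
  bool MatchAllModules() override {
    // Visit every loaded module; GetPatternForRuntimeLibrary() is ignored.
    return true;
  }

  bool CheckIfRuntimeIsValid(const lldb::ModuleSP module_sp) override {
    // The runtime library's name isn't known ahead of time, so look for
    // its symbols in whichever module they happen to live in.
    return ContainsSoftTrapRuntimeSymbols(module_sp);
  }
};
```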
The concrete use case here is for a `InstrumentationRuntime` plugin for implementations of the `-fbounds-safety` soft-trap runtime which can have multiple different implementations and so the module containing the runtime functions isn't known ahead of time. This plug-in will be upstreamed as part of the process of upstreaming `-fbounds-safety`. An alternative to this would be for the `GetPatternForRuntimeLibrary()` function to return a regex that matches everything. While that technically works this new API more clearly indicates in the intent. We probably also save a little perf by not executing the regex match for every loaded module but I have not measured this. rdar://163230807 --- lldb/include/lldb/Target/InstrumentationRuntime.h | 7 +++++++ lldb/source/Target/InstrumentationRuntime.cpp | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. + /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); From f77ce52b56d025399f489a8c0aad8c18c4b06045 Mon Sep 17 00:00:00 2001 From: jinge90 <ge.jin@intel.com> Date: Tue, 4 Nov 2025 05:55:38 +0800 Subject: [PATCH 083/313] [Driver][NFC] Don't specify offloading model in help text for -Xarch_device/host (#165503) Current implementation for -Xarch_device/host can work in any offloading model besides CUDA/HIP, so remove the specific offloading model in help text to align with implementation. 
--------- Signed-off-by: jinge90 <ge.jin@intel.com> Co-authored-by: Alexey Bader <alexey.bader@intel.com> Co-authored-by: Joseph Huber <huberjn@outlook.com> --- clang/include/clang/Driver/Options.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5cf332783cbc3..11e81e032d5fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -951,9 +951,9 @@ def Xarch__ the host system, which can be used to suppress incompatible GPU arguments.}]>, MetaVarName<"<arch> <arg>">; def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>, - HelpText<"Pass <arg> to the CUDA/HIP host compilation">, MetaVarName<"<arg>">; + HelpText<"Pass <arg> to host compilation in the offloading toolchain">, MetaVarName<"<arg>">; def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>, - HelpText<"Pass <arg> to the CUDA/HIP device compilation">, MetaVarName<"<arg>">; + HelpText<"Pass <arg> to device compilation in the offloading toolchain">, MetaVarName<"<arg>">; def Xassembler : Separate<["-"], "Xassembler">, HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">, Group<CompileOnly_Group>; From ac21fde46488c8c46f1cabf8205ecd73c3276765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 11:56:22 -1000 Subject: [PATCH 084/313] [flang][cuda] Add support for f16 atomicadd (#166229) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 43 ++++++++++++++++++- flang/module/cudadevice.f90 | 5 +++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 4 ++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..9f15ce68eb3d5 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -188,6 +188,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicCas(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..6be4d9ce0a46c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -294,6 +294,10 @@ static constexpr IntrinsicHandler handlers[]{ {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddr2", + &I::genAtomicAddR2, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomiccasd", &I::genAtomicCas, @@ -3119,7 +3123,6 @@ static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, mlir::Value 
IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - mlir::LLVM::AtomicBinOp binOp = mlir::isa<mlir::IntegerType>(args[1].getType()) ? mlir::LLVM::AtomicBinOp::add @@ -3127,6 +3130,44 @@ mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, return genAtomBinOp(builder, loc, binOp, args[0], args[1]); } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..7a764b589dc56 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,11 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function end interface interface atomicsub diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..674548b7489e8 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,6 +14,8 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x @@ -34,6 +36,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -128,6 +131,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! 
CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 From f62ff29157cb75a17b087475168fa15ddcb7c059 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Mon, 3 Nov 2025 17:00:57 -0500 Subject: [PATCH 085/313] [mlir][spirv] Fix Intel SubgroupBlock* op tests (#166185) `spirv-val` learned how to check related ops. Move them to a new test file and set the required capabilities. Closes: https://github.com/llvm/llvm-project/issues/166184 --- mlir/test/Target/SPIRV/group-ops.mlir | 30 +++------------- .../Target/SPIRV/subgroup-block-intel.mlir | 34 +++++++++++++++++++ 2 files changed, 38 insertions(+), 26 deletions(-) create mode 100644 mlir/test/Target/SPIRV/subgroup-block-intel.mlir diff --git a/mlir/test/Target/SPIRV/group-ops.mlir b/mlir/test/Target/SPIRV/group-ops.mlir index cf519cba961c5..6f19b3553dd37 100644 --- a/mlir/test/Target/SPIRV/group-ops.mlir +++ b/mlir/test/Target/SPIRV/group-ops.mlir @@ -1,11 +1,13 @@ -// RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip -split-input-file %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip --split-input-file %s | FileCheck %s // RUN: %if spirv-tools %{ rm -rf %t %} // RUN: %if spirv-tools %{ mkdir %t %} // RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} // RUN: %if spirv-tools %{ spirv-val %t %} -spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, SubgroupBallotKHR, Groups, SubgroupBufferBlockIOINTEL, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_INTEL_subgroups, SPV_KHR_uniform_group_instructions]> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.3, + [Shader, Linkage, SubgroupBallotKHR, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], + [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_KHR_uniform_group_instructions]> { // CHECK-LABEL: @subgroup_ballot spirv.func @subgroup_ballot(%predicate: i1) -> vector<4xi32> "None" { // CHECK: %{{.*}} = spirv.KHR.SubgroupBallot %{{.*}}: vector<4xi32> @@ -24,30 +26,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, Subgrou %0 = spirv.GroupBroadcast <Workgroup> %value, %localid : f32, vector<3xi32> spirv.ReturnValue %0: f32 } - // CHECK-LABEL: @subgroup_block_read_intel - spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 - spirv.ReturnValue %0: i32 - } - // CHECK-LABEL: @subgroup_block_read_intel_vector - spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - spirv.ReturnValue %0: vector<3xi32> - } - // CHECK-LABEL: @subgroup_block_write_intel - spirv.func 
@subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 - spirv.Return - } - // CHECK-LABEL: @subgroup_block_write_intel_vector - spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> - spirv.Return - } // CHECK-LABEL: @group_iadd spirv.func @group_iadd(%value: i32) -> i32 "None" { // CHECK: spirv.GroupIAdd <Workgroup> <Reduce> %{{.*}} : i32 diff --git a/mlir/test/Target/SPIRV/subgroup-block-intel.mlir b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir new file mode 100644 index 0000000000000..14060e632fffd --- /dev/null +++ b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Physical64 GLSL450 requires #spirv.vce<v1.3, [Addresses, Shader, Linkage, SubgroupBufferBlockIOINTEL], + [SPV_KHR_storage_buffer_storage_class, SPV_INTEL_subgroups]> { + // CHECK-LABEL: @subgroup_block_read_intel + spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 + spirv.ReturnValue %0: i32 + } + // CHECK-LABEL: @subgroup_block_read_intel_vector + spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + spirv.ReturnValue %0: vector<3xi32> + } + // CHECK-LABEL: @subgroup_block_write_intel + spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 + spirv.Return + } + // CHECK-LABEL: @subgroup_block_write_intel_vector + spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> + spirv.Return + } +} From 2c8781de64891d1614bf54dd9e5e4f183a8acca3 Mon Sep 17 00:00:00 2001 From: Jacob Lambert <jacob.lambert@amd.com> Date: Mon, 3 Nov 2025 14:09:09 -0800 Subject: [PATCH 086/313] Revert "[Clang] Make the AS of llvm.compiler.used & llvm.used elements addrspace(0)" (#166242) Reverts llvm/llvm-project#164432 Breaks Comgr tests with the following: [2025-11-03T19:18:20.564Z] + clang -x hip --offload-arch=amdgcnspirv -nogpulib -nogpuinc --no-gpu-bundle-output --offload-device-only -O3 /jenkins/workspace/compiler-psdb-amd-staging/repos/llvm-project/amd/comgr/test-lit/spirv-tests/spirv-to-reloc.hip -o 
/jenkins/workspace/compiler-psdb-amd-staging/repos/out/ubuntu-22.04/22.04/build/amd_comgr/test-lit/spirv-tests/Output/spirv-to-reloc.hip.tmp.spv -fvisibility=hidden -fno-autolink -fexceptions -fcolor-diagnostics [2025-11-03T19:18:20.564Z] InvalidModule: Invalid SPIR-V module: Casts from private/local/global address space are allowed only to generic [2025-11-03T19:18:20.564Z] [2025-11-03T19:18:20.564Z] <badref> = addrspacecast ptr addrspace(1) @__hip_cuid_94fb83be5559070 to ptr [2025-11-03T19:18:20.564Z] clang: error: amdgcn-link command failed with exit code 10 (use -v to see invocation) --- clang/lib/CodeGen/CodeGenModule.cpp | 12 ++-- .../embed-bitcode-marker-with-nonzero-as.c | 2 +- .../llvm_compiler_used_elements_are_unqual.c | 64 ------------------- 3 files changed, 7 insertions(+), 71 deletions(-) delete mode 100644 clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index af5be95aec1cd..0fea57b2e1799 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3331,18 +3331,18 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name, if (List.empty()) return; - llvm::PointerType *UnqualPtr = - llvm::PointerType::getUnqual(CGM.getLLVMContext()); - // Convert List to what ConstantArray needs. SmallVector<llvm::Constant*, 8> UsedArray; UsedArray.resize(List.size()); for (unsigned i = 0, e = List.size(); i != e; ++i) { - UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<llvm::Constant>(&*List[i]), UnqualPtr); + UsedArray[i] = + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<llvm::Constant>(&*List[i]), CGM.Int8PtrTy); } - llvm::ArrayType *ATy = llvm::ArrayType::get(UnqualPtr, UsedArray.size()); + if (UsedArray.empty()) + return; + llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size()); auto *GV = new llvm::GlobalVariable( CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage, diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c index 8af9708a1bfb8..df7118859c764 100644 --- a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c +++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c @@ -3,6 +3,6 @@ // CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1 // CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1 -// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo.managed to ptr), ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr)], section "llvm.metadata" +// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata" __attribute__((managed)) int foo = 42; diff --git 
a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c deleted file mode 100644 index b6550fb1e5c77..0000000000000 --- a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c +++ /dev/null @@ -1,64 +0,0 @@ -// RUN: %clang_cc1 -x c -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x c -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x c -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV -// RUN: %clang_cc1 -x c -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_CL -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_CL -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL - -#ifndef __OPENCL_C_VERSION__ -#define __constant const -#endif - -static __constant __attribute__((__used__)) int foo = 42; - - -// X86: @foo = internal constant i32 42 -// X86: @llvm.compiler.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" -// -// AMDGCN: @foo = internal addrspace(4) constant i32 42 -// AMDGCN: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @foo to ptr), ptr @bar], section "llvm.metadata" -// -// SPIRV: @foo = internal constant i32 42 -// SPIRV: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" -// -// SPIRV_CL: @foo = internal addrspace(2) constant i32 42 -// SPIRV_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr @bar], section "llvm.metadata" -// -// SPIRV_AMD: @foo = internal addrspace(1) constant i32 42 -// SPIRV_AMD: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" -// -// SPIRV_AMD_CL: @foo = internal addrspace(2) constant i32 42 -// SPIRV_AMD_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" -// -// X86: define internal void @bar() #{{[0-9]}} { -// -// AMDGCN: define internal void @bar() #{{[0-9]}} { -// -// SPIRV: define internal spir_func void @bar() #{{[0-9]}} { -// -// SPIRV_CL: define internal spir_func void @bar() #{{[0-9]}} { -// -// SPIRV_AMD: define internal spir_func void @bar() addrspace(4) 
#{{[0-9]}} { -// -// SPIRV_AMD_CL: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} { -// -static void __attribute__((__used__)) bar() { -} From ed53c413ef51abca52a258c7ddc271e81ec22be0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Mon, 3 Nov 2025 14:12:58 -0800 Subject: [PATCH 087/313] [profcheck] Exclude more tools tests (#166239) Excluding test areas that (1) don't really pertain to the profcheck effort, and (2) are easier to maintain this way. --- llvm/test/lit.cfg.py | 5 +++++ llvm/utils/profcheck-xfail.txt | 26 -------------------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 974af4b571503..cadf781b409be 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,13 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 661c88125c9c8..61bc936cd151a 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -530,32 +530,6 @@ Instrumentation/TypeSanitizer/swifterror.ll LTO/X86/diagnostic-handler-remarks-with-hotness.ll Other/optimization-remarks-auto.ll Other/X86/debugcounter-partiallyinlinelibcalls.ll -tools/llvm-objcopy/ELF/auto-remove-add-symtab-shndx.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-access-analysis.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-distribute.test -tools/UpdateTestChecks/update_test_checks/argument_name_reuse.test -tools/UpdateTestChecks/update_test_checks/basic.test -tools/UpdateTestChecks/update_test_checks/check_attrs.test -tools/UpdateTestChecks/update_test_checks/difile_absolute_filenames.test -tools/UpdateTestChecks/update_test_checks/filter_out_after.test -tools/UpdateTestChecks/update_test_checks/generated_funcs_prefix_reuse.test -tools/UpdateTestChecks/update_test_checks/generated_funcs.test -tools/UpdateTestChecks/update_test_checks/global_preserve_name.test -tools/UpdateTestChecks/update_test_checks/if_target.test -tools/UpdateTestChecks/update_test_checks/named_function_arguments_split.test -tools/UpdateTestChecks/update_test_checks/on_the_fly_arg_change.test -tools/UpdateTestChecks/update_test_checks/phi-labels.test -tools/UpdateTestChecks/update_test_checks/pre-process.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values2.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values3.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values4.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values.test -tools/UpdateTestChecks/update_test_checks/switch_case.test 
-tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test -tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll Transforms/AtomicExpand/AArch64/pcsections.ll From a522ae3ef6e13cb39e7756c151652e03a024b301 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:17:39 -0800 Subject: [PATCH 088/313] ARM: Remove unnecessary manual ABI lowering for sincos_stret (#166040) LowerCallTo handles all of the ABI details, including the load of implicit sret return to the expected result positions. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 39 ++----------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..3a00267395504 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -9869,32 +9869,12 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - Args.emplace_back(Arg, ArgTy); StringRef LibcallName = getLibcallImplName(SincosStret); @@ -9904,25 +9884,10 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); + .setCallee(CC, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. 
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); + return CallResult.first; } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, From 3c2c9d5bc1e2c8a1648f77d4a4a574c56c392442 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:18:04 -0800 Subject: [PATCH 089/313] DAG: Cleanup string bool attribute check for disable-tail-calls (#166237) --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..fa0c899dfcc27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8958,9 +8958,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. Lowering From ecaaebf8f0f70a0b6e21c19fdd4be23ffe23ef53 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:19:58 -0800 Subject: [PATCH 090/313] X86: Correct IR type used for sincos_stret return value (#166240) Match the IR type that clang uses here: https://godbolt.org/z/KzbodEcxh This was manually selecting the IR legal type. Instead just set the flag to ensure legal types. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2970cf42df731..b97b5089cb0a3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33034,12 +33034,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); + : (Type *)FixedVectorType::get(ArgTy, 2); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); From 5d9d89092e0fd34f3ea2ca15c9f30d4c01bb29f3 Mon Sep 17 00:00:00 2001 From: Stefan Mada <smada@nvidia.com> Date: Mon, 3 Nov 2025 14:22:25 -0800 Subject: [PATCH 091/313] [NVPTX] Add more clear error message for using invalid syncscope (#165737) Using invalid syncscopes on certain NVVM intrinsics causes an obscure error to appear: (error 9: NVVM_ERROR_COMPILATION), libNVVM extra log: Could not find scope ID=5. This is not a very helpful error. 
A much more useful error would be something like 'NVPTX does not support syncscope "agent"'. This would immediately make it clear that the issue is not NVPTX-specific, but actually comes from the code being fed to NVPTX. This would save users time when debugging such issues. --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 22 ++++++++++++++----- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 + .../cmpxchg-unsupported-syncscope.err.ll | 11 ++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c667a09f95dbb..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto scopeName = Context->getSyncScopeName(ID); + assert(scopeName.has_value() && "Scope name must exist."); + + // Build list of supported syncscopes programmatically + SmallVector<StringRef> supportedScopes; + for (const auto &Entry : Scopes) { + if (auto name = Context->getSyncScopeName(Entry.first)) + supportedScopes.push_back(name->empty() ? "<empty string>" : *name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + scopeName.value(), int(ID), + make_range(supportedScopes.begin(), supportedScopes.end()))); } return S->second; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 1cb579bd96730..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..d3853e2fdaa88 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope.
+ +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} From 0e3612e751381d35a3826a67527845e23bbd22af Mon Sep 17 00:00:00 2001 From: Christopher Ferris <cferris1000@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:31:11 -0800 Subject: [PATCH 092/313] [scudo] Add config option to modify get usable size behavior (#158710) Currently, Scudo always returns the exact size allocated when calling getUsableSize. This can be a performance issue: some programs get the usable size and make unnecessary calls to realloc because they think there isn't enough space in the allocation. By default, usable size will still return the exact size of the allocation. Note that if the exact behavior is disabled and MTE is on, then the code will still give an exact usable size. --- .../lib/scudo/standalone/allocator_config.def | 4 + compiler-rt/lib/scudo/standalone/combined.h | 88 +++++-- .../scudo/standalone/tests/combined_test.cpp | 242 ++++++++++++++++++ .../standalone/tests/wrappers_c_test.cpp | 9 +- 4 files changed, 324 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def index 748530820cd64..0aea7b8f2fb9a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.def +++ b/compiler-rt/lib/scudo/standalone/allocator_config.def @@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false) // Disable the quarantine code. BASE_OPTIONAL(const bool, QuarantineDisabled, false) +// If set to true, malloc_usable_size returns the exact size of the allocation. +// If set to false, return the total available size in the allocation. +BASE_OPTIONAL(const bool, ExactUsableSize, true) + // PRIMARY_REQUIRED_TYPE(NAME) // // SizeClassMap to use with the Primary.
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 329ec4596482b..ffe9554203241 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -706,19 +706,26 @@ class Allocator { if (!getChunkFromBlock(Block, &Chunk, &Header) && !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) return; - } else { - if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; + } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) { + return; } - if (Header.State == Chunk::State::Allocated) { - uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<AllocatorConfig>()) - TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<AllocatorConfig>(Primary.Options.load())) - TaggedChunk = loadTag(Chunk); - Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), - Arg); + + if (Header.State != Chunk::State::Allocated) + return; + + uptr TaggedChunk = Chunk; + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + TaggedChunk = untagPointer(TaggedChunk); + uptr Size; + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) { + TaggedChunk = loadTag(Chunk); + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else if (AllocatorConfig::getExactUsableSize()) { + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else { + Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header); } + Callback(TaggedChunk, Size, Arg); }; Primary.iterateOverBlocks(Lambda); Secondary.iterateOverBlocks(Lambda); @@ -759,16 +766,50 @@ class Allocator { return false; } - // Return the usable size for a given chunk. Technically we lie, as we just - // report the actual size of a chunk. This is done to counteract code actively - // writing past the end of a chunk (like sqlite3) when the usable size allows - // for it, which then forces realloc to copy the usable size of a chunk as - // opposed to its actual size. + ALWAYS_INLINE uptr getUsableSize(const void *Ptr, + Chunk::UnpackedHeader *Header) { + void *BlockBegin = getBlockBegin(Ptr, Header); + if (LIKELY(Header->ClassId)) { + return SizeClassMap::getSizeByClassId(Header->ClassId) - + (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin)); + } + + uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) { + UntaggedPtr = untagPointer(UntaggedPtr); + BlockBegin = untagPointer(BlockBegin); + } + return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr; + } + + // Return the usable size for a given chunk. If MTE is enabled or if the + // ExactUsableSize config parameter is true, we report the exact size of + // the original allocation size. Otherwise, we will return the total + // actual usable size. uptr getUsableSize(const void *Ptr) { if (UNLIKELY(!Ptr)) return 0; - return getAllocSize(Ptr); + if (AllocatorConfig::getExactUsableSize() || + UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) + return getAllocSize(Ptr); + + initThreadMaybe(); + +#ifdef GWP_ASAN_HOOKS + if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) + return GuardedAlloc.getSize(Ptr); +#endif // GWP_ASAN_HOOKS + + Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + + // Getting the alloc size of a chunk only makes sense if it's allocated. 
+ if (UNLIKELY(Header.State != Chunk::State::Allocated)) + reportInvalidChunkState(AllocatorAction::Sizing, Ptr); + + return getUsableSize(Ptr, &Header); } uptr getAllocSize(const void *Ptr) { @@ -951,6 +992,19 @@ class Allocator { MemorySize, 2, 16); } + uptr getBlockBeginTestOnly(const void *Ptr) { + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + DCHECK(Header.State == Chunk::State::Allocated); + + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Ptr = untagPointer(const_cast<void *>(Ptr)); + void *Begin = getBlockBegin(Ptr, &Header); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Begin = untagPointer(Begin); + return reinterpret_cast<uptr>(Begin); + } + private: typedef typename PrimaryT::SizeClassMap SizeClassMap; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 5fdfd1e7c55cc..4837ac96b9b26 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) { EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos); } +struct UsableSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // In order to properly test the usable size, this Primary config has + // four real size classes: 1024, 2048, 4096, 8192. + using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; + +template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) { + // Scan through all sizes up to 10000 then some larger sizes. + for (scudo::uptr Size = 1; Size < 10000; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(Size, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size; + Allocator.deallocate(P, Origin); + } + + // Verify that aligned allocations also return the exact size allocated. + const scudo::uptr AllocSize = 313; + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(AllocSize, Origin, 1U << Align); + EXPECT_EQ(AllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << 1 << Align; + Allocator.deallocate(P, Origin); + } + + // Verify an explicitly large allocations. 
+ const scudo::uptr LargeAllocSize = 1000000; + void *P = Allocator.allocate(LargeAllocSize, Origin); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)); + Allocator.deallocate(P, Origin); + + // Now do it for aligned allocations for large allocations. + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << 1 << Align; + Allocator.deallocate(P, Origin); + } +} + +template <class AllocatorT> +void VerifyIterateOverUsableSize(AllocatorT &Allocator) { + // This will not verify if the size is the exact size or the size of the + // size class. Instead verify that the size matches the usable size and + // assume the other tests have verified getUsableSize. + std::unordered_map<void *, size_t> Pointers; + Pointers.insert({Allocator.allocate(128, Origin), 0U}); + Pointers.insert({Allocator.allocate(128, Origin, 32), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U}); + + Allocator.disable(); + Allocator.iterateOverChunks( + 0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), + [](uintptr_t Base, size_t Size, void *Arg) { + std::unordered_map<void *, size_t> *Pointers = + reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg); + (*Pointers)[reinterpret_cast<void *>(Base)] = Size; + }, + reinterpret_cast<void *>(&Pointers)); + Allocator.enable(); + + for (auto [Ptr, IterateSize] : Pointers) { + EXPECT_NE(0U, IterateSize) + << "Pointer " << Ptr << " not found in iterateOverChunks call."; + EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr)) + << "Pointer " << Ptr + << " mismatch between iterate size and usable size."; + Allocator.deallocate(Ptr, Origin); + } +} + +TEST(ScudoCombinedTest, ExactUsableSize) { + using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, ExactUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +template <class AllocatorT> +void VerifyUsableSizePrimary(AllocatorT &Allocator) { + std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr StartSize; + if (I == 0) + StartSize = 1; + else + StartSize = SizeClasses[I - 1]; + scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize(); + for (scudo::uptr Size = 
StartSize; Size < UsableSize; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(UsableSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size + << " for size class " << SizeClass; + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + } + + StartSize = UsableSize + 1; + } + + std::vector<scudo::uptr> Alignments = {32U, 128U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr AllocSize; + if (I == 0) + AllocSize = 1; + else + AllocSize = SizeClasses[I - 1] + 1; + + for (auto Alignment : Alignments) { + void *P = Allocator.allocate(AllocSize, Origin, Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << AllocSize + << " for size class " << SizeClass << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } + } +} + +template <class AllocatorT> +void VerifyUsableSizeSecondary(AllocatorT &Allocator) { + const scudo::uptr LargeAllocSize = 996780; + const scudo::uptr PageSize = scudo::getPageSizeCached(); + void *P = Allocator.allocate(LargeAllocSize, Origin); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + // Assumes that the secondary always rounds up allocations to a page boundary. + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + + // Check aligned allocations now. + for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << LargeAllocSize + << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } +} + +struct TestFullUsableSizeConfig : TestExactUsableSizeConfig { + static const bool ExactUsableSize = false; +}; + +TEST(ScudoCombinedTest, FullUsableSize) { + using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyUsableSizePrimary<AllocatorT>(*Allocator); + VerifyUsableSizeSecondary<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, FullUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + // When MTE is enabled, you get exact sizes. + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} // Verify that no special quarantine blocks appear in iterateOverChunks. 
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); From e987ab11a6f3d3965ef26fc42c82db3e8b1d56f5 Mon Sep 17 00:00:00 2001 From: Michael Kruse <llvm-project@meinersbur.de> Date: Mon, 3 Nov 2025 23:34:50 +0100 Subject: [PATCH 093/313] [Polly] Introduce PhaseManager and remove LPM support (#125442) Instead of relying on any pass manager to schedule Polly's passes, add Polly's own pipeline manager, which is seen as a monolithic pass in LLVM's pass manager. Polly's former passes are now phases of the new PhaseManager component. Relying on LLVM's pass manager (the legacy as well as the New Pass Manager) to manage Polly's phases was never a good fit; the PhaseManager resolves the following issues: * Polly passes were modifying analysis results, in particular RegionInfo and ScopInfo. This means that there was not just one unique and "definite" analysis result; the actual result depended on which analyses ran beforehand, and the pass manager was not allowed to throw away cached analyses or prior SCoP optimizations would have been forgotten. The LLVM pass manager's persistence of analysis results is not contractual but designed for caching. * Polly depends on a particular execution order of passes and regions (e.g. regression tests, invalidation of consecutive SCoPs). LLVM's pass manager does not guarantee any execution order. * Polly does not completely preserve DominatorTree, RegionInfo, LoopInfo, or ScalarEvolution, but only as-needed for Polly's own uses. Because the ScopDetection object stores references to those analyses, it still had to lie to the pass manager that they would be preserved, or the pass manager would have released and recomputed the invalidated analysis objects that ScopDetection/ScopInfo was still referencing. To ensure that no non-Polly pass would see these not-completely-preserved analyses, all analyses still had to be thrown away after the ScopPassManager, or, in the case of the LPM, with a BarrierNoopPass. * The NPM's PassInstrumentation wraps the IR unit into an `llvm::Any` object, but implementations such as PrintIRInstrumentation call llvm_unreachable on encountering an unknown IR unit, such as SCoPs, with no extension points to add support. Hence LLVM crashes when dumping IR between SCoP passes (such as `-print-before-changed` with Polly being active). The new PhaseManager uses some command line options that previously belonged to Polly's legacy passes, such as `-polly-print-detect` (so the option will continue to work).
Hence the LPM support is incompatible with the new approach and support for it is removed. --- polly/docs/ReleaseNotes.rst | 4 + polly/include/polly/Canonicalization.h | 8 - polly/include/polly/CodeGen/CodeGeneration.h | 3 + polly/include/polly/CodeGen/IslAst.h | 36 +- polly/include/polly/CodePreparation.h | 6 + polly/include/polly/DeLICM.h | 11 +- polly/include/polly/DeadCodeElimination.h | 13 +- polly/include/polly/DependenceInfo.h | 106 +---- polly/include/polly/FlattenSchedule.h | 16 +- polly/include/polly/ForwardOpTree.h | 20 +- polly/include/polly/JSONExporter.h | 13 +- polly/include/polly/LinkAllPasses.h | 156 ------- polly/include/polly/MaximalStaticExpansion.h | 2 + polly/include/polly/Pass/PhaseManager.h | 127 ++++++ polly/include/polly/Pass/PollyFunctionPass.h | 32 ++ polly/include/polly/Pass/PollyModulePass.h | 30 ++ polly/include/polly/PruneUnprofitable.h | 12 +- polly/include/polly/RegisterPasses.h | 2 - polly/include/polly/ScheduleOptimizer.h | 16 +- polly/include/polly/ScopDetection.h | 27 -- polly/include/polly/ScopGraphPrinter.h | 3 + polly/include/polly/ScopInfo.h | 76 ---- polly/include/polly/ScopInliner.h | 6 - polly/include/polly/ScopPass.h | 28 -- polly/include/polly/Simplify.h | 23 +- .../include/polly/Support/DumpFunctionPass.h | 12 - polly/include/polly/Support/DumpModulePass.h | 15 +- polly/include/polly/Support/ScopHelper.h | 8 - polly/lib/Analysis/DependenceInfo.cpp | 220 +-------- polly/lib/Analysis/PruneUnprofitable.cpp | 32 +- polly/lib/Analysis/ScopBuilder.cpp | 1 + polly/lib/Analysis/ScopDetection.cpp | 107 ----- polly/lib/Analysis/ScopGraphPrinter.cpp | 120 +---- polly/lib/Analysis/ScopInfo.cpp | 228 ---------- polly/lib/Analysis/ScopPass.cpp | 36 -- polly/lib/CMakeLists.txt | 3 + polly/lib/CodeGen/CodeGeneration.cpp | 72 +-- polly/lib/CodeGen/IslAst.cpp | 101 +---- polly/lib/Exchange/JSONExporter.cpp | 160 +------ polly/lib/Pass/PhaseManager.cpp | 424 ++++++++++++++++++ polly/lib/Pass/PollyFunctionPass.cpp | 22 + polly/lib/Pass/PollyModulePass.cpp | 29 ++ polly/lib/Support/DumpFunctionPass.cpp | 41 -- polly/lib/Support/DumpModulePass.cpp | 47 -- polly/lib/Support/PollyPasses.def | 25 +- polly/lib/Support/RegisterPasses.cpp | 401 +++++++++++------ polly/lib/Support/ScopHelper.cpp | 12 - polly/lib/Transform/Canonicalization.cpp | 65 --- polly/lib/Transform/CodePreparation.cpp | 69 --- polly/lib/Transform/DeLICM.cpp | 125 ++---- polly/lib/Transform/DeadCodeElimination.cpp | 44 +- polly/lib/Transform/FlattenSchedule.cpp | 139 ++---- polly/lib/Transform/ForwardOpTree.cpp | 131 ++---- .../lib/Transform/MaximalStaticExpansion.cpp | 81 +--- polly/lib/Transform/ScheduleOptimizer.cpp | 152 +------ polly/lib/Transform/ScopInliner.cpp | 46 -- polly/lib/Transform/Simplify.cpp | 101 +---- polly/test/CodeGen/20100617.ll | 2 +- polly/test/CodeGen/20100622.ll | 4 +- polly/test/CodeGen/20100707.ll | 2 +- polly/test/CodeGen/20100707_2.ll | 2 +- polly/test/CodeGen/20100708.ll | 2 +- polly/test/CodeGen/20100708_2.ll | 2 +- polly/test/CodeGen/20100713.ll | 2 +- polly/test/CodeGen/20100713_2.ll | 2 +- polly/test/CodeGen/20100717.ll | 2 +- polly/test/CodeGen/20100718-DomInfo-2.ll | 2 +- polly/test/CodeGen/20100718-DomInfo.ll | 2 +- .../CodeGen/20100720-MultipleConditions.ll | 2 +- .../test/CodeGen/20100809-IndependentBlock.ll | 2 +- ...0100811-ScalarDependencyBetweenBrAndCnd.ll | 2 +- polly/test/CodeGen/20101030-Overflow.ll | 2 +- polly/test/CodeGen/20101103-Overflow3.ll | 2 +- polly/test/CodeGen/20101103-signmissmatch.ll | 2 +- .../test/CodeGen/20110226-Ignore-Dead-Code.ll | 2 
+- .../test/CodeGen/20110226-PHI-Node-removed.ll | 2 +- polly/test/CodeGen/20120316-InvalidCast.ll | 2 +- .../CodeGen/20120403-RHS-type-mismatch.ll | 2 +- polly/test/CodeGen/20130221.ll | 2 +- .../20150328-SCEVExpanderIntroducesNewIV.ll | 2 +- polly/test/CodeGen/Intrinsics/llvm-expect.ll | 2 +- .../do_not_mutate_debug_info.ll | 2 +- .../loop_nest_param_parallel.ll | 2 +- .../single_loop_param_parallel.ll | 4 +- polly/test/CodeGen/MemAccess/bad_alignment.ll | 2 +- .../MemAccess/codegen_address_space.ll | 2 +- .../MemAccess/codegen_constant_offset.ll | 2 +- .../test/CodeGen/MemAccess/codegen_simple.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_float.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_md.ll | 4 +- .../MemAccess/codegen_simple_md_float.ll | 4 +- polly/test/CodeGen/MemAccess/create_arrays.ll | 4 +- .../CodeGen/MemAccess/create_arrays_heap.ll | 4 +- .../default_aligned_new_access_function.ll | 2 +- .../test/CodeGen/MemAccess/different_types.ll | 4 +- polly/test/CodeGen/MemAccess/generate-all.ll | 6 +- .../CodeGen/MemAccess/invariant_base_ptr.ll | 4 +- .../CodeGen/MemAccess/map_scalar_access.ll | 4 +- .../test/CodeGen/MemAccess/multiple_types.ll | 4 +- polly/test/CodeGen/MemAccess/simple.ll | 2 +- .../test/CodeGen/MemAccess/simple_analyze.ll | 2 +- .../MemAccess/update_access_functions.ll | 4 +- .../CodeGen/Metadata/basic_vec_annotate.ll | 2 +- polly/test/CodeGen/OpenMP/alias-metadata.ll | 2 +- .../floord-as-argument-to-subfunction.ll | 2 +- polly/test/CodeGen/OpenMP/inlineasm.ll | 2 +- .../invariant_base_pointer_preloaded.ll | 3 +- ...ant_base_pointer_preloaded_different_bb.ll | 3 +- ...base_pointer_preloaded_pass_only_needed.ll | 3 +- .../invariant_base_pointers_preloaded.ll | 3 +- .../OpenMP/loop-body-references-outer-iv.ll | 4 +- .../loop-body-references-outer-values-2.ll | 4 +- .../loop-body-references-outer-values-3.ll | 4 +- .../loop-body-references-outer-values.ll | 4 +- .../OpenMP/loop-bounds-reference-outer-ids.ll | 4 +- .../test/CodeGen/OpenMP/mapped-phi-access.ll | 2 +- polly/test/CodeGen/OpenMP/matmul-parallel.ll | 4 +- .../CodeGen/OpenMP/new_multidim_access.ll | 8 +- polly/test/CodeGen/OpenMP/recomputed-srem.ll | 3 +- ...ference-argument-from-non-affine-region.ll | 19 +- .../test/CodeGen/OpenMP/reference-other-bb.ll | 2 +- .../OpenMP/reference-preceeding-loop.ll | 4 +- polly/test/CodeGen/OpenMP/reference_latest.ll | 2 +- polly/test/CodeGen/OpenMP/scev-rewriting.ll | 2 +- polly/test/CodeGen/OpenMP/single_loop.ll | 18 +- ...single_loop_with_loop_invariant_baseptr.ll | 4 +- .../CodeGen/OpenMP/single_loop_with_param.ll | 16 +- ...o-parallel-loops-reference-outer-indvar.ll | 4 +- polly/test/CodeGen/PHIInExit.ll | 2 +- .../combine_different_values.ll | 4 +- .../RuntimeDebugBuilder/stmt_tracing.ll | 2 +- polly/test/CodeGen/alias-check-multi-dim.ll | 3 +- .../CodeGen/alias_metadata_too_many_arrays.ll | 3 +- ...aliasing_different_base_and_access_type.ll | 2 +- .../aliasing_different_pointer_types.ll | 2 +- .../aliasing_multidimensional_access.ll | 2 +- .../CodeGen/aliasing_parametric_simple_1.ll | 2 +- .../CodeGen/aliasing_parametric_simple_2.ll | 2 +- polly/test/CodeGen/aliasing_struct_element.ll | 2 +- polly/test/CodeGen/alignment.ll | 2 +- polly/test/CodeGen/annotated_alias_scopes.ll | 2 +- polly/test/CodeGen/blas_sscal_simplified.ll | 2 +- ...code-hosting-and-escape-map-computation.ll | 2 +- polly/test/CodeGen/constant_condition.ll | 2 +- polly/test/CodeGen/create-conditional-scop.ll | 2 +- ...d_instruction_referenced_by_parameter_1.ll | 2 +- 
...d_instruction_referenced_by_parameter_2.ll | 2 +- polly/test/CodeGen/debug-intrinsics.ll | 8 +- ...nce_problem_after_early_codegen_bailout.ll | 2 +- polly/test/CodeGen/empty_domain_in_context.ll | 2 +- polly/test/CodeGen/entry_with_trivial_phi.ll | 2 +- .../entry_with_trivial_phi_other_bb.ll | 2 +- .../error-stmt-in-non-affine-region.ll | 2 +- ...or_block_contains_invalid_memory_access.ll | 2 +- polly/test/CodeGen/exprModDiv.ll | 7 +- .../hoisted_load_escapes_through_phi.ll | 6 +- polly/test/CodeGen/hoisting_1.ll | 2 +- polly/test/CodeGen/hoisting_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_1.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_3.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_in_lb.ll | 4 +- .../inner_scev_sdiv_in_lb_invariant.ll | 3 +- polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll | 3 +- polly/test/CodeGen/intrinsics_lifetime.ll | 2 +- polly/test/CodeGen/intrinsics_misc.ll | 2 +- .../inv-load-lnt-crash-wrong-order-2.ll | 3 +- .../inv-load-lnt-crash-wrong-order-3.ll | 3 +- .../CodeGen/inv-load-lnt-crash-wrong-order.ll | 3 +- .../test/CodeGen/invariant-load-dimension.ll | 4 +- ...-load-preload-base-pointer-origin-first.ll | 2 +- .../CodeGen/invariant_cannot_handle_void.ll | 4 +- polly/test/CodeGen/invariant_load.ll | 2 +- .../CodeGen/invariant_load_address_space.ll | 2 +- .../CodeGen/invariant_load_alias_metadata.ll | 3 +- .../CodeGen/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...variant_load_base_pointer_conditional_2.ll | 6 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- .../test/CodeGen/invariant_load_condition.ll | 2 +- .../invariant_load_different_sized_types.ll | 3 +- polly/test/CodeGen/invariant_load_escaping.ll | 2 +- .../invariant_load_escaping_second_scop.ll | 2 +- .../invariant_load_in_non_affine_subregion.ll | 2 +- polly/test/CodeGen/invariant_load_loop_ub.ll | 2 +- ...ant_load_not_executed_but_in_parameters.ll | 2 +- .../test/CodeGen/invariant_load_outermost.ll | 2 +- ...riant_load_parameters_cyclic_dependence.ll | 4 +- .../CodeGen/invariant_load_ptr_ptr_noalias.ll | 2 +- .../test/CodeGen/invariant_load_scalar_dep.ll | 2 +- ...riant_load_scalar_escape_alloca_sharing.ll | 2 +- ...oads_from_struct_with_different_types_1.ll | 2 +- ...oads_from_struct_with_different_types_2.ll | 2 +- ...invariant_loads_ignore_parameter_bounds.ll | 3 +- .../invariant_verify_function_failed.ll | 2 +- .../invariant_verify_function_failed_2.ll | 4 +- polly/test/CodeGen/issue56692.ll | 2 +- .../large-numbers-in-boundary-context.ll | 2 +- .../test/CodeGen/load_subset_with_context.ll | 2 +- .../loop-invariant-load-type-mismatch.ll | 2 +- polly/test/CodeGen/loop_with_condition.ll | 2 +- polly/test/CodeGen/loop_with_condition_2.ll | 2 +- .../test/CodeGen/loop_with_condition_ineq.ll | 2 +- .../CodeGen/loop_with_condition_nested.ll | 4 +- ..._conditional_entry_edge_split_hard_case.ll | 2 +- polly/test/CodeGen/memcpy_annotations.ll | 2 +- .../multidim-non-matching-typesize-2.ll | 3 +- .../CodeGen/multidim-non-matching-typesize.ll | 3 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- polly/test/CodeGen/multidim_alias_check.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 4 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- .../multiple-types-invariant-load-2.ll | 3 +- .../CodeGen/multiple-types-invariant-load.ll | 3 +- .../multiple_sai_fro_same_base_address.ll | 4 +- polly/test/CodeGen/no-overflow-tracking.ll | 4 +- polly/test/CodeGen/no_guard_bb.ll | 2 +- 
...non-affine-dominance-generated-entering.ll | 2 +- .../CodeGen/non-affine-exit-node-dominance.ll | 2 +- .../non-affine-phi-node-expansion-2.ll | 3 +- .../non-affine-phi-node-expansion-3.ll | 3 +- .../non-affine-phi-node-expansion-4.ll | 3 +- .../CodeGen/non-affine-phi-node-expansion.ll | 3 +- ...e-region-exit-phi-incoming-synthesize-2.ll | 2 +- ...ine-region-exit-phi-incoming-synthesize.ll | 2 +- .../non-affine-region-implicit-store.ll | 2 +- ...ine-region-phi-references-in-scop-value.ll | 3 +- .../non-affine-subregion-dominance-reuse.ll | 3 +- polly/test/CodeGen/non-affine-switch.ll | 3 +- .../non-affine-synthesized-in-branch.ll | 2 +- polly/test/CodeGen/non-affine-update.ll | 3 +- .../non-hoisted-load-needed-as-base-ptr.ll | 2 +- .../test/CodeGen/non_affine_float_compare.ll | 4 +- .../CodeGen/only_non_affine_error_region.ll | 2 +- polly/test/CodeGen/openmp_limit_threads.ll | 12 +- .../test/CodeGen/out-of-scop-phi-node-use.ll | 2 +- polly/test/CodeGen/param_div_div_div_2.ll | 4 +- polly/test/CodeGen/partial_write_array.ll | 2 +- polly/test/CodeGen/partial_write_emptyset.ll | 2 +- ...l_write_full_write_that_appears_partial.ll | 2 +- .../partial_write_impossible_restriction.ll | 2 +- polly/test/CodeGen/partial_write_in_region.ll | 5 +- .../partial_write_in_region_with_loop.ll | 5 +- .../CodeGen/partial_write_mapped_scalar.ll | 2 +- .../partial_write_mapped_scalar_subregion.ll | 2 +- polly/test/CodeGen/perf_monitoring.ll | 3 +- .../perf_monitoring_cycles_per_scop.ll | 3 +- .../perf_monitoring_trip_counts_per_scop.ll | 3 +- polly/test/CodeGen/phi-defined-before-scop.ll | 2 +- .../phi_after_error_block_outside_of_scop.ll | 2 +- .../test/CodeGen/phi_condition_modeling_1.ll | 2 +- .../test/CodeGen/phi_condition_modeling_2.ll | 2 +- .../test/CodeGen/phi_conditional_simple_1.ll | 4 +- .../phi_in_exit_early_lnt_failure_1.ll | 2 +- .../phi_in_exit_early_lnt_failure_2.ll | 2 +- .../phi_in_exit_early_lnt_failure_3.ll | 2 +- .../phi_in_exit_early_lnt_failure_5.ll | 2 +- polly/test/CodeGen/phi_loop_carried_float.ll | 2 +- .../CodeGen/phi_loop_carried_float_escape.ll | 6 +- polly/test/CodeGen/phi_scalar_simple_1.ll | 2 +- polly/test/CodeGen/phi_scalar_simple_2.ll | 2 +- .../CodeGen/phi_with_multi_exiting_edges_2.ll | 2 +- polly/test/CodeGen/phi_with_one_exit_edge.ll | 2 +- .../CodeGen/pointer-type-expressions-2.ll | 4 +- .../test/CodeGen/pointer-type-expressions.ll | 4 +- .../pointer-type-pointer-type-comparison.ll | 4 +- polly/test/CodeGen/pointer_rem.ll | 4 +- polly/test/CodeGen/pr25241.ll | 2 +- polly/test/CodeGen/ptrtoint_as_parameter.ll | 2 +- polly/test/CodeGen/read-only-scalars.ll | 8 +- polly/test/CodeGen/reduction.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/reduction_simple_binary.ll | 2 +- polly/test/CodeGen/reggen_domtree_crash.ll | 2 +- .../test/CodeGen/region-with-instructions.ll | 2 +- polly/test/CodeGen/region_exiting-domtree.ll | 2 +- .../CodeGen/region_multiexit_partialwrite.ll | 2 +- ...run-time-condition-with-scev-parameters.ll | 4 +- polly/test/CodeGen/run-time-condition.ll | 2 +- .../scalar-references-used-in-scop-compute.ll | 2 +- .../test/CodeGen/scalar-store-from-same-bb.ll | 3 +- polly/test/CodeGen/scalar_codegen_crash.ll | 3 +- polly/test/CodeGen/scev-backedgetaken.ll | 2 +- .../CodeGen/scev-division-invariant-load.ll | 2 +- polly/test/CodeGen/scev.ll | 2 +- .../CodeGen/scev_expansion_in_nonaffine.ll | 3 +- .../CodeGen/scev_looking_through_bitcasts.ll | 2 +- .../CodeGen/scop_expander_insert_point.ll | 3 +- polly/test/CodeGen/scop_expander_segfault.ll 
| 2 +- ...p_never_executed_runtime_check_location.ll | 2 +- polly/test/CodeGen/select-base-pointer.ll | 2 +- polly/test/CodeGen/sequential_loops.ll | 2 +- .../CodeGen/simple_loop_non_single_exit.ll | 2 +- .../CodeGen/simple_loop_non_single_exit_2.ll | 2 +- polly/test/CodeGen/simple_non_single_entry.ll | 2 +- polly/test/CodeGen/simple_nonaffine_loop.ll | 2 +- .../single_do_loop_int_max_iterations.ll | 2 +- .../single_do_loop_int_param_iterations.ll | 2 +- .../single_do_loop_ll_max_iterations.ll | 4 +- .../CodeGen/single_do_loop_one_iteration.ll | 2 +- .../CodeGen/single_do_loop_scev_replace.ll | 2 +- polly/test/CodeGen/single_loop.ll | 2 +- .../CodeGen/single_loop_int_max_iterations.ll | 2 +- .../CodeGen/single_loop_ll_max_iterations.ll | 2 +- .../test/CodeGen/single_loop_one_iteration.ll | 2 +- polly/test/CodeGen/single_loop_param.ll | 2 +- .../CodeGen/single_loop_param_less_equal.ll | 6 +- .../CodeGen/single_loop_param_less_than.ll | 4 +- .../CodeGen/single_loop_zero_iterations.ll | 2 +- polly/test/CodeGen/split_edge_of_exit.ll | 4 +- polly/test/CodeGen/split_edges.ll | 2 +- polly/test/CodeGen/split_edges_2.ll | 2 +- polly/test/CodeGen/srem-in-other-bb.ll | 3 +- .../stack-overflow-in-load-hoisting.ll | 3 +- .../test/CodeGen/stmt_split_no_dependence.ll | 2 +- .../CodeGen/switch-in-non-affine-region.ll | 3 +- .../synthesizable_phi_write_after_loop.ll | 2 +- .../test-invalid-operands-for-select-2.ll | 2 +- .../test-invalid-operands-for-select.ll | 2 +- polly/test/CodeGen/test.ll | 2 +- .../two-loops-right-after-each-other-2.ll | 2 +- .../two-scops-in-row-invalidate-scevs.ll | 2 +- polly/test/CodeGen/two-scops-in-row.ll | 4 +- polly/test/CodeGen/udiv_expansion_position.ll | 2 +- .../CodeGen/uninitialized_scalar_memory.ll | 2 +- .../unpredictable-loop-unsynthesizable.ll | 6 +- .../test/CodeGen/variant_load_empty_domain.ll | 2 +- .../whole-scop-non-affine-subregion.ll | 3 +- polly/test/DeLICM/confused_order.ll | 4 +- ...ontradicting_assumed_context_and_domain.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/map_memset_zero.ll | 4 +- polly/test/DeLICM/nomap_alreadymapped.ll | 2 +- polly/test/DeLICM/nomap_escaping.ll | 2 +- polly/test/DeLICM/nomap_occupied.ll | 2 +- polly/test/DeLICM/nomap_readonly.ll | 2 +- polly/test/DeLICM/nomap_spuriouswrite.ll | 2 +- polly/test/DeLICM/nomap_storagesize.ll | 2 +- polly/test/DeLICM/nomap_writewrite.ll | 2 +- polly/test/DeLICM/outofquota-reverseDomain.ll | 2 +- polly/test/DeLICM/pass_existence.ll | 6 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reduction.ll | 2 +- .../DeLICM/reduction_constant_selfconflict.ll | 2 +- polly/test/DeLICM/reduction_looprotate.ll | 2 +- .../reduction_looprotate_alwaystaken.ll | 2 +- .../DeLICM/reduction_looprotate_gvnpre.ll | 4 +- .../reduction_looprotate_gvnpre_cond1.ll | 2 +- .../reduction_looprotate_gvnpre_cond2.ll | 2 +- ...reduction_looprotate_gvnpre_nopreheader.ll | 2 +- .../DeLICM/reduction_looprotate_hoisted.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm2.ll | 2 +- .../reduction_looprotate_licm_double_write.ll | 5 +- .../reduction_looprotate_licm_nopreheader.ll | 2 +- .../test/DeLICM/reduction_looprotate_load.ll | 2 +- .../reduction_looprotate_loopguard_gvnpre.ll | 2 +- .../reduction_looprotate_loopguard_licm1.ll | 2 +- .../reduction_looprotate_loopguard_licm2.ll | 2 +- .../reduction_looprotate_loopguard_licm3.ll | 2 +- .../DeLICM/reduction_looprotate_readonly.ll | 2 +- 
.../reduction_looprotate_synthesizable.ll | 2 +- .../test/DeLICM/reduction_looprotate_undef.ll | 2 +- .../test/DeLICM/reduction_overapproximate.ll | 6 +- polly/test/DeLICM/reduction_preheader.ll | 2 +- .../test/DeLICM/reduction_unrelatedunusual.ll | 2 +- polly/test/DeLICM/reject_loadafterstore.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 4 +- polly/test/DeLICM/reject_storeafterstore.ll | 2 +- polly/test/DeLICM/reject_storeinsubregion.ll | 2 +- polly/test/DeLICM/reject_unusualstore.ll | 4 +- polly/test/DeLICM/skip_maywrite.ll | 2 +- polly/test/DeLICM/skip_multiaccess.ll | 2 +- polly/test/DeLICM/skip_notinloop.ll | 2 +- polly/test/DeLICM/skip_scalaraccess.ll | 2 +- .../DeadCodeElimination/chained_iterations.ll | 4 +- .../chained_iterations_2.ll | 4 +- polly/test/DeadCodeElimination/computeout.ll | 4 +- .../dead_iteration_elimination.ll | 2 +- .../non-affine-affine-mix.ll | 2 +- polly/test/DeadCodeElimination/non-affine.ll | 2 +- .../test/DeadCodeElimination/null_schedule.ll | 2 +- polly/test/DependenceInfo/computeout.ll | 4 +- .../different_schedule_dimensions.ll | 3 +- polly/test/DependenceInfo/do_pluto_matmult.ll | 4 +- polly/test/DependenceInfo/fine_grain_dep_0.ll | 4 +- .../generate_may_write_dependence_info.ll | 2 +- .../test/DependenceInfo/infeasible_context.ll | 6 +- ...writes_do_not_block_must_writes_for_war.ll | 2 +- .../nonaffine-condition-buildMemoryAccess.ll | 2 +- .../reduction_complex_location.ll | 6 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_dependences_not_null.ll | 2 +- .../reduction_indirect_access.ll | 2 +- ...reduction_and_non_reduction_dependences.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 6 +- .../reduction_multiple_loops_array_sum_2.ll | 2 +- .../reduction_multiple_loops_array_sum_3.ll | 2 +- .../reduction_multiple_reductions.ll | 2 +- .../reduction_multiple_reductions_2.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- ...lly_escaping_intermediate_in_other_stmt.ll | 2 +- .../reduction_privatization_deps.ll | 2 +- .../reduction_privatization_deps_2.ll | 2 +- .../reduction_privatization_deps_3.ll | 2 +- .../reduction_privatization_deps_4.ll | 2 +- .../reduction_privatization_deps_5.ll | 2 +- .../test/DependenceInfo/reduction_sequence.ll | 2 +- .../DependenceInfo/reduction_simple_iv.ll | 2 +- ...ion_simple_iv_debug_wrapped_dependences.ll | 2 +- .../reduction_simple_privatization_deps_2.ll | 2 +- ...n_simple_privatization_deps_w_parameter.ll | 2 +- ...duction_two_reductions_different_rloops.ll | 2 +- polly/test/DependenceInfo/sequential_loops.ll | 6 +- polly/test/FlattenSchedule/gemm.ll | 2 +- polly/test/ForwardOpTree/atax.ll | 2 +- polly/test/ForwardOpTree/changed-kind.ll | 2 +- .../test/ForwardOpTree/forward_from_region.ll | 2 +- polly/test/ForwardOpTree/forward_hoisted.ll | 2 +- .../test/ForwardOpTree/forward_instruction.ll | 2 +- .../test/ForwardOpTree/forward_into_region.ll | 2 +- .../forward_into_region_redundant_use.ll | 2 +- polly/test/ForwardOpTree/forward_load.ll | 2 +- .../forward_load_differentarray.ll | 2 +- .../forward_load_double_write.ll | 2 +- .../ForwardOpTree/forward_load_fromloop.ll | 2 +- .../ForwardOpTree/forward_load_indirect.ll | 2 +- .../forward_load_memset_after.ll | 2 +- .../forward_load_memset_before.ll | 2 +- .../ForwardOpTree/forward_load_tripleuse.ll | 2 +- .../forward_load_unrelatedunusual.ll | 2 +- polly/test/ForwardOpTree/forward_phi_load.ll | 2 +- polly/test/ForwardOpTree/forward_readonly.ll | 4 +- polly/test/ForwardOpTree/forward_reusue.ll | 2 +- 
polly/test/ForwardOpTree/forward_store.ll | 2 +- .../forward_synthesizable_definloop.ll | 2 +- .../forward_synthesizable_indvar.ll | 2 +- .../forward_synthesizable_useinloop.ll | 2 +- .../test/ForwardOpTree/forward_transitive.ll | 2 +- polly/test/ForwardOpTree/jacobi-1d.ll | 2 +- .../ForwardOpTree/noforward_from_region.ll | 2 +- .../noforward_load_conditional.ll | 2 +- .../noforward_load_writebetween.ll | 2 +- .../ForwardOpTree/noforward_outofquota.ll | 4 +- polly/test/ForwardOpTree/noforward_partial.ll | 2 +- polly/test/ForwardOpTree/noforward_phi.ll | 2 +- .../ForwardOpTree/noforward_selfrefphi.ll | 2 +- .../ForwardOpTree/noforward_sideffects.ll | 2 +- .../noforward_synthesizable_unknownit.ll | 2 +- polly/test/ForwardOpTree/out-of-quota1.ll | 2 +- .../OpenMP/multiple_loops_outer_parallel.ll | 2 +- .../OpenMP/nested_loop_both_parallel.ll | 2 +- .../nested_loop_both_parallel_parametric.ll | 2 +- .../OpenMP/nested_loop_inner_parallel.ll | 2 +- .../OpenMP/nested_loop_outer_parallel.ll | 2 +- .../OpenMP/single_loop_param_non_parallel.ll | 2 +- .../OpenMP/single_loop_param_parallel.ll | 2 +- .../single_loop_param_parallel_computeout.ll | 2 +- .../alias_checks_with_empty_context.ll | 3 +- polly/test/IstAstInfo/alias_simple_1.ll | 10 +- polly/test/IstAstInfo/alias_simple_2.ll | 12 +- polly/test/IstAstInfo/alias_simple_3.ll | 10 +- .../aliasing_arrays_with_identical_base.ll | 4 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- .../dependence_distance_constant.ll | 2 +- .../IstAstInfo/dependence_distance_minimal.ll | 2 +- .../dependence_distance_multiple_constant.ll | 2 +- .../dependence_distance_parametric.ll | 2 +- .../dependence_distance_parametric_expr.ll | 2 +- .../IstAstInfo/dependence_distance_varying.ll | 2 +- ...pendence_distance_varying_in_outer_loop.ll | 2 +- .../dependence_distance_varying_multiple.ll | 2 +- .../domain_bounded_only_with_context.ll | 2 +- polly/test/IstAstInfo/non_affine_access.ll | 2 +- ...duction_clauses_multidimensional_access.ll | 2 +- ...reduction_clauses_onedimensional_access.ll | 2 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_different_reduction_clauses.ll | 2 +- .../IstAstInfo/reduction_in_one_dimension.ll | 2 +- .../IstAstInfo/reduction_loop_reversal.ll | 2 +- ...ction_modulo_and_loop_reversal_schedule.ll | 2 +- ...ion_modulo_and_loop_reversal_schedule_2.ll | 2 +- .../IstAstInfo/reduction_modulo_schedule.ll | 2 +- ...ion_modulo_schedule_multiple_dimensions.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_2.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_3.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_4.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_5.ll | 2 +- .../reduction_multiple_dimensions.ll | 2 +- .../reduction_multiple_dimensions_2.ll | 2 +- .../reduction_multiple_dimensions_3.ll | 2 +- .../reduction_multiple_dimensions_4.ll | 2 +- polly/test/IstAstInfo/run-time-condition.ll | 2 +- .../runtime_context_with_error_blocks.ll | 2 +- .../IstAstInfo/simple-run-time-condition.ll | 2 +- .../test/IstAstInfo/single_loop_strip_mine.ll | 4 +- .../single_loop_uint_max_iterations.ll | 2 +- .../single_loop_ull_max_iterations.ll | 2 +- .../ImportAccesses-Bad-relation.ll | 2 +- .../ImportAccesses-No-accesses-key.ll | 2 +- .../ImportAccesses-Not-enough-MemAcc.ll | 2 +- .../ImportAccesses-Not-enough-statements.ll | 2 +- .../ImportAccesses-Relation-mispelled.ll | 2 +- .../ImportAccesses-Statements-mispelled.ll | 2 +- 
...ImportAccesses-Undeclared-ScopArrayInfo.ll | 2 +- .../ImportAccesses-Wrong-number-dimensions.ll | 2 +- .../ImportArrays-Mispelled-type.ll | 2 +- .../ImportArrays-Negative-size.ll | 2 +- .../ImportArrays/ImportArrays-No-name.ll | 2 +- .../ImportArrays/ImportArrays-No-sizes-key.ll | 2 +- .../ImportArrays/ImportArrays-No-type-key.ll | 2 +- .../ImportContext-Context-mispelled.ll | 2 +- .../ImportContext-Not-parameter-set.ll | 2 +- .../ImportContext-Unvalid-Context.ll | 2 +- .../ImportContext-Wrong-dimension.ll | 2 +- .../ImportSchedule-No-schedule-key.ll | 2 +- .../ImportSchedule-Schedule-not-valid.ll | 2 +- .../ImportSchedule-Statements-mispelled.ll | 2 +- .../ImportSchedule-Wrong-number-statements.ll | 2 +- .../load_after_store_same_statement.ll | 4 +- .../read_from_original.ll | 4 +- .../MaximalStaticExpansion/too_many_writes.ll | 4 +- .../working_deps_between_inners.ll | 2 +- .../working_deps_between_inners_phi.ll | 4 +- .../working_expansion.ll | 2 +- ...sion_multiple_dependences_per_statement.ll | 2 +- ...sion_multiple_instruction_per_statement.ll | 2 +- .../working_phi_expansion.ll | 4 +- .../working_phi_two_scalars.ll | 4 +- .../working_value_expansion.ll | 2 +- .../prune_only_scalardeps.ll | 2 +- .../2012-03-16-Empty-Domain.ll | 2 +- .../2013-04-11-Empty-Domain-two.ll | 2 +- .../GreedyFuse/fuse-double.ll | 4 +- .../GreedyFuse/fuse-except-first.ll | 4 +- .../GreedyFuse/fuse-except-third.ll | 4 +- .../GreedyFuse/fuse-inner-carried.ll | 4 +- .../GreedyFuse/fuse-inner-third.ll | 4 +- .../GreedyFuse/fuse-inner.ll | 4 +- .../GreedyFuse/fuse-simple.ll | 4 +- .../GreedyFuse/nofuse-simple.ll | 4 +- .../GreedyFuse/nofuse-with-middle.ll | 4 +- .../ManualOptimization/disable_nonforced.ll | 2 +- .../distribute_heuristic.ll | 4 +- .../distribute_illegal_looploc.ll | 2 +- .../distribute_illegal_pragmaloc.ll | 2 +- .../ManualOptimization/unroll_disable.ll | 2 +- .../ManualOptimization/unroll_double.ll | 2 +- .../ManualOptimization/unroll_full.ll | 2 +- .../ManualOptimization/unroll_heuristic.ll | 4 +- .../ManualOptimization/unroll_partial.ll | 4 +- .../unroll_partial_followup.ll | 8 +- .../ScheduleOptimizer/SIMDInParallelFor.ll | 2 +- polly/test/ScheduleOptimizer/computeout.ll | 4 +- .../ensure-correct-tile-sizes.ll | 7 +- .../focaltech_test_detail_threshold-7bc17e.ll | 2 +- .../full_partial_tile_separation.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling-2.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling.ll | 2 +- .../mat_mul_pattern_data_layout.ll | 11 +- .../mat_mul_pattern_data_layout_2.ll | 10 +- .../ScheduleOptimizer/one-dimensional-band.ll | 2 +- .../ScheduleOptimizer/outer_coincidence.ll | 4 +- ...attern-matching-based-opts-after-delicm.ll | 6 +- ...tern-matching-based-opts-after-delicm_2.ll | 4 +- .../pattern-matching-based-opts.ll | 9 +- .../pattern-matching-based-opts_11.ll | 14 +- .../pattern-matching-based-opts_12.ll | 10 +- .../pattern-matching-based-opts_13.ll | 10 +- .../pattern-matching-based-opts_14.ll | 11 +- .../pattern-matching-based-opts_15.ll | 4 +- .../pattern-matching-based-opts_16.ll | 3 +- .../pattern-matching-based-opts_17.ll | 3 +- .../pattern-matching-based-opts_18.ll | 3 +- .../pattern-matching-based-opts_19.ll | 3 +- .../pattern-matching-based-opts_2.ll | 3 +- .../pattern-matching-based-opts_20.ll | 3 +- .../pattern-matching-based-opts_21.ll | 3 +- .../pattern-matching-based-opts_22.ll | 3 +- .../pattern-matching-based-opts_24.ll | 4 +- .../pattern-matching-based-opts_25.ll | 3 +- .../pattern-matching-based-opts_3.ll | 17 +- .../pattern-matching-based-opts_4.ll 
| 12 +- .../pattern-matching-based-opts_5.ll | 10 +- .../pattern-matching-based-opts_6.ll | 10 +- .../pattern-matching-based-opts_7.ll | 10 +- .../pattern-matching-based-opts_8.ll | 10 +- .../pattern-matching-based-opts_9.ll | 12 +- .../pattern_matching_based_opts_splitmap.ll | 2 +- .../prevectorization-without-tiling.ll | 2 +- .../ScheduleOptimizer/prevectorization.ll | 4 +- .../prevectorization_islbound.ll | 2 +- .../ScheduleOptimizer/rectangular-tiling.ll | 8 +- .../ScheduleOptimizer/schedule_computeout.ll | 2 +- polly/test/ScheduleOptimizer/statistics.ll | 2 +- .../ScheduleOptimizer/tile_after_fusion.ll | 2 +- ...vivid_vbi_gen_sliced-before-llvmreduced.ll | 2 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- polly/test/ScopDetect/aliasing_simple_1.ll | 2 +- polly/test/ScopDetect/aliasing_simple_2.ll | 2 +- polly/test/ScopDetect/base_pointer.ll | 2 +- .../base_pointer_load_setNewAccessRelation.ll | 2 +- .../base_pointer_setNewAccessRelation.ll | 4 +- polly/test/ScopDetect/callbr.ll | 4 +- .../ScopDetect/collective_invariant_loads.ll | 2 +- .../ScopDetect/cross_loop_non_single_exit.ll | 2 +- .../cross_loop_non_single_exit_2.ll | 2 +- ...ependency_to_phi_node_outside_of_region.ll | 2 +- .../test/ScopDetect/detect-full-functions.ll | 2 +- polly/test/ScopDetect/dom-tree-crash.ll | 2 +- polly/test/ScopDetect/dot-scops-npm.ll | 2 +- polly/test/ScopDetect/dot-scops.ll | 2 +- .../ScopDetect/error-block-always-executed.ll | 2 +- .../error-block-referenced-from-scop.ll | 2 +- .../ScopDetect/error-block-unreachable.ll | 2 +- .../ScopDetect/expand-region-correctly-2.ll | 2 +- .../ScopDetect/expand-region-correctly.ll | 2 +- .../test/ScopDetect/ignore_func_flag_regex.ll | 2 +- .../index_from_unpredictable_loop.ll | 4 +- .../index_from_unpredictable_loop2.ll | 4 +- polly/test/ScopDetect/indvars.ll | 2 +- polly/test/ScopDetect/intrinsics_1.ll | 2 +- polly/test/ScopDetect/intrinsics_2.ll | 2 +- polly/test/ScopDetect/intrinsics_3.ll | 2 +- .../ScopDetect/invalid-latch-conditions.ll | 6 +- .../ScopDetect/invalidate_scalar_evolution.ll | 2 +- .../ScopDetect/invariant-load-before-scop.ll | 2 +- polly/test/ScopDetect/keep_going_expansion.ll | 2 +- polly/test/ScopDetect/mod_ref_read_pointer.ll | 4 +- polly/test/ScopDetect/more-than-one-loop.ll | 4 +- .../ScopDetect/multidim-with-undef-size.ll | 2 +- polly/test/ScopDetect/multidim.ll | 2 +- .../ScopDetect/multidim_indirect_access.ll | 2 +- ..._two_accesses_different_delinearization.ll | 2 +- .../ScopDetect/nested_loop_single_exit.ll | 4 +- .../test/ScopDetect/non-affine-conditional.ll | 2 +- .../ScopDetect/non-affine-float-compare.ll | 2 +- ...-affine-loop-condition-dependent-access.ll | 8 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- polly/test/ScopDetect/non-affine-loop.ll | 10 +- .../non-beneficial-loops-small-trip-count.ll | 2 +- .../non-constant-add-rec-start-expr.ll | 2 +- .../ScopDetect/non-simple-memory-accesses.ll | 2 +- .../ScopDetect/non_affine_loop_condition.ll | 4 +- polly/test/ScopDetect/only-one-affine-loop.ll | 2 +- polly/test/ScopDetect/only_func_flag.ll | 2 +- polly/test/ScopDetect/only_func_flag_regex.ll | 2 +- .../parametric-multiply-in-scev-2.ll | 2 +- .../ScopDetect/parametric-multiply-in-scev.ll | 2 +- .../phi_with_multi_exiting_edges.ll | 2 +- .../profitability-large-basic-blocks.ll | 10 +- .../profitability-two-nested-loops.ll | 2 +- polly/test/ScopDetect/remove_all_children.ll | 2 +- polly/test/ScopDetect/report-scop-location.ll | 2 
+- .../restrict-undef-size-scopdetect.ll | 2 +- polly/test/ScopDetect/run_time_alias_check.ll | 2 +- polly/test/ScopDetect/scev_remove_max.ll | 2 +- polly/test/ScopDetect/sequential_loops.ll | 2 +- polly/test/ScopDetect/simple_loop.ll | 2 +- .../simple_loop_non_single_entry.ll | 2 +- .../ScopDetect/simple_loop_non_single_exit.ll | 2 +- .../simple_loop_non_single_exit_2.ll | 2 +- .../ScopDetect/simple_loop_two_phi_nodes.ll | 2 +- .../test/ScopDetect/simple_loop_with_param.ll | 2 +- .../ScopDetect/simple_loop_with_param_2.ll | 2 +- .../ScopDetect/simple_non_single_entry.ll | 2 +- .../ScopDetect/skip_function_attribute.ll | 2 +- .../srem_with_parametric_divisor.ll | 2 +- polly/test/ScopDetect/statistics.ll | 2 +- polly/test/ScopDetect/switch-in-loop-patch.ll | 2 +- .../test/ScopDetect/tlr_is_hoistable_load.ll | 2 +- .../ReportAlias-01.ll | 2 +- .../ScopDetectionDiagnostics/ReportEntry.ll | 2 +- .../ReportFuncCall-01.ll | 2 +- .../ReportIrreducibleRegion.ll | 2 +- .../ReportIrreducibleRegionWithoutDebugLoc.ll | 2 +- .../ReportLoopBound-01.ll | 16 +- .../ReportLoopHasNoExit.ll | 4 +- .../ReportMultipleNonAffineAccesses.ll | 12 +- .../ReportNonAffineAccess-01.ll | 2 +- .../ReportUnprofitable.ll | 8 +- .../ReportUnreachableInExit.ll | 3 +- .../ReportVariantBasePtr-01.ll | 2 +- .../loop_has_multiple_exits.ll | 2 +- .../loop_partially_in_scop-2.ll | 2 +- .../loop_partially_in_scop.ll | 2 +- .../ScopInfo/20110312-Fail-without-basicaa.ll | 2 +- .../20111108-Parameter-not-detected.ll | 2 +- ...03-16-Crash-because-of-unsigned-in-scev.ll | 2 +- .../2015-10-04-Crash-in-domain-generation.ll | 2 +- polly/test/ScopInfo/Alias-0.ll | 4 +- polly/test/ScopInfo/Alias-1.ll | 4 +- polly/test/ScopInfo/Alias-2.ll | 4 +- polly/test/ScopInfo/Alias-3.ll | 4 +- polly/test/ScopInfo/Alias-4.ll | 4 +- .../test/ScopInfo/BoundChecks/single-loop.ll | 4 +- polly/test/ScopInfo/BoundChecks/two-loops.ll | 4 +- polly/test/ScopInfo/NonAffine/div_backedge.ll | 2 +- polly/test/ScopInfo/NonAffine/div_domain.ll | 2 +- ...nt_loads_dependent_in_non_affine_region.ll | 2 +- .../ScopInfo/NonAffine/modulo_backedge.ll | 2 +- .../test/ScopInfo/NonAffine/modulo_domain.ll | 2 +- ...ffine-loop-condition-dependent-access_1.ll | 4 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- .../non_affine_access_with_range_2.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_sdiv.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_srem.ll | 2 +- .../non_affine_conditional_nested.ll | 2 +- ...ine_conditional_surrounding_affine_loop.ll | 11 +- ...conditional_surrounding_non_affine_loop.ll | 16 +- .../NonAffine/non_affine_float_compare.ll | 2 +- .../NonAffine/non_affine_loop_condition.ll | 6 +- .../NonAffine/non_affine_loop_used_later.ll | 4 +- .../NonAffine/non_affine_parametric_loop.ll | 2 +- .../non_affine_region_guaranteed_non-entry.ll | 2 +- ...whole-scop-non-affine-subregion-in-loop.ll | 2 +- .../aliasing_conditional_alias_groups_1.ll | 2 +- .../aliasing_conditional_alias_groups_2.ll | 2 +- polly/test/ScopInfo/aliasing_dead_access.ll | 2 +- .../aliasing_many_arrays_to_compare.ll | 7 +- ...iasing_many_parameters_not_all_involved.ll | 4 +- .../aliasing_many_read_only_acesses.ll | 2 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_with_non_affine_access.ll | 2 +- .../allow-all-parameters-dereferencable.ll | 11 +- polly/test/ScopInfo/assume_gep_bounds.ll | 4 +- polly/test/ScopInfo/assume_gep_bounds_2.ll | 3 +- polly/test/ScopInfo/assume_gep_bounds_many.ll | 3 +- 
.../avoid_new_parameters_from_geps.ll | 2 +- polly/test/ScopInfo/bool-addrec.ll | 2 +- .../test/ScopInfo/bounded_loop_assumptions.ll | 2 +- ...ces-loop-scev-with-unknown-iterations-2.ll | 6 +- ...ces-loop-scev-with-unknown-iterations-3.ll | 7 +- ...ences-loop-scev-with-unknown-iterations.ll | 7 +- polly/test/ScopInfo/bug_2010_10_22.ll | 2 +- polly/test/ScopInfo/bug_2011_1_5.ll | 2 +- .../test/ScopInfo/bug_scev_not_fully_eval.ll | 2 +- polly/test/ScopInfo/cfg_consequences.ll | 2 +- .../test/ScopInfo/complex-branch-structure.ll | 3 +- polly/test/ScopInfo/complex-condition.ll | 4 +- polly/test/ScopInfo/complex-expression.ll | 4 +- polly/test/ScopInfo/complex-loop-nesting.ll | 2 +- .../ScopInfo/complex-successor-structure-2.ll | 4 +- .../ScopInfo/complex-successor-structure-3.ll | 3 +- .../ScopInfo/complex-successor-structure.ll | 4 +- .../complex_domain_binary_condition.ll | 3 +- .../ScopInfo/complex_execution_context.ll | 4 +- polly/test/ScopInfo/cond_constant_in_loop.ll | 2 +- polly/test/ScopInfo/cond_in_loop.ll | 2 +- .../ScopInfo/condition-after-error-block-2.ll | 2 +- ...condition-after-error-block-before-scop.ll | 2 +- .../ScopInfo/condtion-after-error-block.ll | 2 +- polly/test/ScopInfo/const_srem_sdiv.ll | 3 +- .../constant-non-integer-branch-condition.ll | 2 +- .../ScopInfo/constant_factor_in_parameter.ll | 4 +- ...stant_functions_outside_scop_as_unknown.ll | 2 +- polly/test/ScopInfo/constant_start_integer.ll | 2 +- polly/test/ScopInfo/debug_call.ll | 2 +- .../delinearize-together-all-data-refs.ll | 2 +- polly/test/ScopInfo/div_by_zero.ll | 2 +- .../do-not-model-error-block-accesses.ll | 2 +- .../eager-binary-and-or-conditions.ll | 4 +- .../early_exit_for_complex_domains.ll | 2 +- polly/test/ScopInfo/error-blocks-1.ll | 2 +- polly/test/ScopInfo/error-blocks-2.ll | 3 +- polly/test/ScopInfo/error-blocks-3.ll | 2 +- polly/test/ScopInfo/escaping_empty_scop.ll | 2 +- polly/test/ScopInfo/exit-phi-1.ll | 4 +- polly/test/ScopInfo/exit-phi-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses.ll | 2 +- .../ScopInfo/expensive-boundary-context.ll | 3 +- ...onstant_factor_introduces_new_parameter.ll | 4 +- polly/test/ScopInfo/full-function.ll | 6 +- polly/test/ScopInfo/granularity_same_name.ll | 8 +- .../test/ScopInfo/granularity_scalar-indep.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi1.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi2.ll | 2 +- .../granularity_scalar-indep_epilogue.ll | 2 +- .../granularity_scalar-indep_epilogue_last.ll | 2 +- .../granularity_scalar-indep_noepilogue.ll | 2 +- .../granularity_scalar-indep_ordered-2.ll | 2 +- .../granularity_scalar-indep_ordered.ll | 2 +- polly/test/ScopInfo/i1_params.ll | 2 +- polly/test/ScopInfo/infeasible-rtc.ll | 6 +- .../ScopInfo/infeasible_invalid_context.ll | 6 +- polly/test/ScopInfo/int2ptr_ptr2int.ll | 4 +- polly/test/ScopInfo/int2ptr_ptr2int_2.ll | 6 +- polly/test/ScopInfo/integers.ll | 2 +- .../ScopInfo/inter-error-bb-dependence.ll | 2 +- polly/test/ScopInfo/inter_bb_scalar_dep.ll | 3 +- .../intra-non-affine-stmt-phi-node.ll | 3 +- .../ScopInfo/intra_and_inter_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intra_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intrinsics.ll | 2 +- ..._add_rec_after_invariant_load_remapping.ll | 2 +- .../invalidate_iterator_during_MA_removal.ll | 2 +- .../test/ScopInfo/invariant-load-instlist.ll | 2 +- ...ariant-loads-leave-read-only-statements.ll | 4 +- polly/test/ScopInfo/invariant_load.ll | 2 +- ...load_access_classes_different_base_type.ll | 4 +- 
...ss_classes_different_base_type_escaping.ll | 4 +- ...lasses_different_base_type_same_pointer.ll | 4 +- ...fferent_base_type_same_pointer_escaping.ll | 4 +- .../ScopInfo/invariant_load_addrec_sum.ll | 2 +- .../ScopInfo/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...ariant_load_base_pointer_in_conditional.ll | 2 +- .../invariant_load_branch_condition.ll | 3 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_2.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_3.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_4.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4b.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4c.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_5.ll | 4 +- .../invariant_load_complex_condition.ll | 3 +- .../test/ScopInfo/invariant_load_condition.ll | 2 +- .../invariant_load_dereferenceable.ll | 4 +- ...iant_load_distinct_parameter_valuations.ll | 2 +- .../ScopInfo/invariant_load_in_non_affine.ll | 3 +- polly/test/ScopInfo/invariant_load_loop_ub.ll | 4 +- .../invariant_load_ptr_ptr_noalias.ll | 3 +- .../ScopInfo/invariant_load_scalar_dep.ll | 2 +- .../ScopInfo/invariant_load_stmt_domain.ll | 2 +- .../invariant_load_zext_parameter-2.ll | 4 +- .../ScopInfo/invariant_load_zext_parameter.ll | 4 +- ...load_zextended_in_own_execution_context.ll | 4 +- ...invariant_loads_complicated_dependences.ll | 2 +- .../invariant_loads_cyclic_dependences.ll | 2 +- polly/test/ScopInfo/invariant_loop_bounds.ll | 2 +- ...ariant_same_loop_bound_multiple_times-1.ll | 2 +- ...ariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/isl_aff_out_of_bounds.ll | 2 +- polly/test/ScopInfo/isl_trip_count_01.ll | 2 +- polly/test/ScopInfo/isl_trip_count_02.ll | 2 +- polly/test/ScopInfo/isl_trip_count_03.ll | 2 +- .../isl_trip_count_multiple_exiting_blocks.ll | 2 +- polly/test/ScopInfo/licm_load.ll | 31 +- polly/test/ScopInfo/licm_potential_store.ll | 79 +--- .../ScopInfo/licm_potential_store_mssa.ll | 50 +++ polly/test/ScopInfo/licm_reduction_nested.ll | 4 +- .../long-compile-time-alias-analysis.ll | 2 +- .../long-sequence-of-error-blocks-2.ll | 2 +- .../ScopInfo/long-sequence-of-error-blocks.ll | 3 +- .../test/ScopInfo/loop-multiexit-succ-cond.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_0.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_1.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_2.ll | 4 +- polly/test/ScopInfo/loop_carry.ll | 2 +- .../test/ScopInfo/many-scalar-dependences.ll | 2 +- polly/test/ScopInfo/max-loop-depth.ll | 2 +- polly/test/ScopInfo/memcpy-raw-source.ll | 2 +- polly/test/ScopInfo/memcpy.ll | 4 +- polly/test/ScopInfo/memmove.ll | 4 +- polly/test/ScopInfo/memset.ll | 4 +- polly/test/ScopInfo/memset_null.ll | 4 +- .../ScopInfo/mismatching-array-dimensions.ll | 2 +- .../mod_ref_access_pointee_arguments.ll | 6 +- .../mod_ref_read_pointee_arguments.ll | 6 +- polly/test/ScopInfo/mod_ref_read_pointer.ll | 4 +- polly/test/ScopInfo/mod_ref_read_pointers.ll | 6 +- polly/test/ScopInfo/modulo_zext_1.ll | 2 +- polly/test/ScopInfo/modulo_zext_2.ll | 2 +- polly/test/ScopInfo/modulo_zext_3.ll | 2 +- polly/test/ScopInfo/multi-scop.ll | 2 +- .../ScopInfo/multidim_2d-diagonal-matrix.ll | 4 +- .../multidim_2d_outer_parametric_offset.ll | 2 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- .../ScopInfo/multidim_2d_with_modref_call.ll | 8 +- .../multidim_2d_with_modref_call_2.ll | 8 +- ..._3d_parametric_array_static_loop_bounds.ll | 2 +- ...idim_fixedsize_different_dimensionality.ll | 2 +- 
.../multidim_fixedsize_multi_offset.ll | 2 +- .../ScopInfo/multidim_fold_constant_dim.ll | 2 +- .../multidim_fold_constant_dim_zero.ll | 2 +- polly/test/ScopInfo/multidim_fortran_2d.ll | 3 +- .../ScopInfo/multidim_fortran_2d_params.ll | 4 +- .../multidim_fortran_2d_with_modref_call.ll | 8 +- polly/test/ScopInfo/multidim_fortran_srem.ll | 2 +- .../test/ScopInfo/multidim_gep_pointercast.ll | 2 +- .../ScopInfo/multidim_gep_pointercast2.ll | 2 +- .../ScopInfo/multidim_invalid_dimension.ll | 2 +- .../multidim_ivs_and_integer_offsets_3d.ll | 2 +- ...multidim_ivs_and_parameteric_offsets_3d.ll | 2 +- .../test/ScopInfo/multidim_many_references.ll | 4 +- .../ScopInfo/multidim_nested_start_integer.ll | 4 +- .../multidim_nested_start_share_parameter.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_2d.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_3d.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_cast.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_reverse.ll | 2 +- .../ScopInfo/multidim_param_in_subscript-2.ll | 2 +- .../ScopInfo/multidim_param_in_subscript.ll | 2 +- .../multidim_parameter_addrec_product.ll | 2 +- .../multidim_single_and_multidim_array.ll | 16 +- polly/test/ScopInfo/multidim_srem.ll | 2 +- polly/test/ScopInfo/multidim_with_bitcast.ll | 2 +- .../ScopInfo/multiple-binary-or-conditions.ll | 4 +- ...ss-offset-not-dividable-by-element-size.ll | 4 +- .../ScopInfo/multiple-types-non-affine-2.ll | 4 +- .../ScopInfo/multiple-types-non-affine.ll | 4 +- .../multiple-types-non-power-of-two-2.ll | 2 +- .../multiple-types-non-power-of-two.ll | 2 +- .../multiple-types-two-dimensional-2.ll | 4 +- .../multiple-types-two-dimensional.ll | 4 +- polly/test/ScopInfo/multiple-types.ll | 3 +- .../test/ScopInfo/multiple_exiting_blocks.ll | 2 +- .../multiple_exiting_blocks_two_loop.ll | 2 +- polly/test/ScopInfo/multiple_latch_blocks.ll | 2 +- polly/test/ScopInfo/nested-loops.ll | 2 +- .../no-scalar-deps-in-non-affine-subregion.ll | 2 +- polly/test/ScopInfo/non-affine-region-phi.ll | 4 +- .../ScopInfo/non-affine-region-with-loop-2.ll | 2 +- .../ScopInfo/non-affine-region-with-loop.ll | 4 +- polly/test/ScopInfo/non-precise-inv-load-1.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-2.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-3.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-4.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-5.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-6.ll | 2 +- polly/test/ScopInfo/non-pure-function-call.ll | 2 +- ...-pure-function-calls-causes-dead-blocks.ll | 2 +- .../test/ScopInfo/non-pure-function-calls.ll | 2 +- polly/test/ScopInfo/non_affine_access.ll | 4 +- polly/test/ScopInfo/non_affine_region_1.ll | 2 +- polly/test/ScopInfo/non_affine_region_2.ll | 2 +- polly/test/ScopInfo/non_affine_region_3.ll | 4 +- polly/test/ScopInfo/non_affine_region_4.ll | 2 +- .../ScopInfo/nonaffine-buildMemoryAccess.ll | 2 +- polly/test/ScopInfo/not-a-reduction.ll | 2 +- polly/test/ScopInfo/opaque-struct.ll | 2 +- ...gion-entry-phi-node-nonaffine-subregion.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- .../ScopInfo/parameter-constant-division.ll | 4 +- .../ScopInfo/parameter_in_dead_statement.ll | 6 +- polly/test/ScopInfo/parameter_product.ll | 2 +- .../parameter_with_constant_factor_in_add.ll | 2 +- .../ScopInfo/partially_invariant_load_1.ll | 4 +- .../ScopInfo/partially_invariant_load_2.ll | 2 +- .../test/ScopInfo/phi-in-non-affine-region.ll | 2 +- polly/test/ScopInfo/phi_after_error_block.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_1.ll | 2 +- 
.../test/ScopInfo/phi_condition_modeling_2.ll | 2 +- .../test/ScopInfo/phi_conditional_simple_1.ll | 2 +- polly/test/ScopInfo/phi_loop_carried_float.ll | 2 +- polly/test/ScopInfo/phi_not_grouped_at_top.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_1.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_2.ll | 2 +- polly/test/ScopInfo/phi_with_invoke_edge.ll | 2 +- .../ScopInfo/pointer-comparison-no-nsw.ll | 2 +- polly/test/ScopInfo/pointer-comparison.ll | 2 +- .../test/ScopInfo/pointer-type-expressions.ll | 2 +- ...er-used-as-base-pointer-and-scalar-read.ll | 2 +- .../polly-timeout-parameter-bounds.ll | 2 +- polly/test/ScopInfo/pr38218.ll | 2 +- ...eserve-equiv-class-order-in-basic_block.ll | 2 +- .../test/ScopInfo/process_added_dimensions.ll | 2 +- .../test/ScopInfo/pwaff-complexity-bailout.ll | 2 +- polly/test/ScopInfo/ranged_parameter.ll | 2 +- polly/test/ScopInfo/ranged_parameter_2.ll | 3 +- polly/test/ScopInfo/ranged_parameter_wrap.ll | 2 +- .../test/ScopInfo/ranged_parameter_wrap_2.ll | 2 +- .../read-only-scalar-used-in-phi-2.ll | 2 +- .../ScopInfo/read-only-scalar-used-in-phi.ll | 2 +- polly/test/ScopInfo/read-only-scalars.ll | 4 +- polly/test/ScopInfo/read-only-statements.ll | 2 +- .../ScopInfo/reduction_alternating_base.ll | 2 +- ...uction_chain_partially_outside_the_scop.ll | 2 +- .../ScopInfo/reduction_different_index.ll | 2 +- .../ScopInfo/reduction_different_index1.ll | 2 +- .../reduction_disabled_multiplicative.ll | 2 +- polly/test/ScopInfo/reduction_double.ll | 2 +- .../reduction_escaping_intermediate.ll | 2 +- .../reduction_escaping_intermediate_2.ll | 2 +- .../reduction_escaping_intermediate_3.ll | 2 +- polly/test/ScopInfo/reduction_if.ll | 2 +- .../ScopInfo/reduction_indirect_access.ll | 2 +- .../ScopInfo/reduction_indirect_access_2.ll | 2 +- .../reduction_invalid_different_operators.ll | 2 +- .../reduction_invalid_overlapping_accesses.ll | 2 +- .../reduction_long_reduction_chain.ll | 2 +- ...duction_long_reduction_chain_double_use.ll | 2 +- .../reduction_multiple_different_operators.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 2 +- .../reduction_multiple_loops_array_sum_1.ll | 2 +- .../reduction_multiple_simple_binary.ll | 2 +- .../reduction_non_overlapping_chains.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- polly/test/ScopInfo/reduction_simple_fp.ll | 2 +- .../ScopInfo/reduction_simple_w_constant.ll | 2 +- polly/test/ScopInfo/reduction_simple_w_iv.ll | 2 +- .../ScopInfo/reduction_two_identical_reads.ll | 4 +- .../redundant_parameter_constraint.ll | 2 +- .../test/ScopInfo/region-with-instructions.ll | 2 +- polly/test/ScopInfo/remarks.ll | 3 +- .../required-invariant-loop-bounds.ll | 3 +- .../ScopInfo/restriction_in_dead_block.ll | 2 +- .../run-time-check-many-array-disjuncts.ll | 5 +- .../run-time-check-many-parameters.ll | 2 +- .../run-time-check-many-piecewise-aliasing.ll | 5 +- .../run-time-check-read-only-arrays.ll | 2 +- .../same-base-address-scalar-and-array.ll | 2 +- polly/test/ScopInfo/scalar.ll | 2 +- .../ScopInfo/scalar_dependence_cond_br.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 4 +- .../scev-div-with-evaluatable-divisor.ll | 2 +- polly/test/ScopInfo/scev-invalidated.ll | 2 +- .../schedule-const-post-dominator-walk-2.ll | 2 +- .../schedule-const-post-dominator-walk.ll | 2 +- .../schedule-constuction-endless-loop1.ll | 2 +- .../schedule-constuction-endless-loop2.ll | 2 +- ...tly-contructed-in-case-of-infinite-loop.ll | 2 +- .../scop-affine-parameter-ordering.ll | 2 +- polly/test/ScopInfo/sign_wrapped_set.ll | 2 +- 
polly/test/ScopInfo/simple_loop_1.ll | 2 +- polly/test/ScopInfo/simple_loop_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_3.ll | 2 +- .../ScopInfo/simple_nonaffine_loop_not.ll | 2 +- polly/test/ScopInfo/smax.ll | 2 +- polly/test/ScopInfo/statistics.ll | 2 +- .../stmt_split_exit_of_region_stmt.ll | 2 +- .../ScopInfo/stmt_split_no_after_split.ll | 2 +- .../test/ScopInfo/stmt_split_no_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_on_store.ll | 2 +- .../ScopInfo/stmt_split_on_synthesizable.ll | 2 +- .../stmt_split_phi_in_beginning_bb.ll | 2 +- polly/test/ScopInfo/stmt_split_phi_in_stmt.ll | 2 +- .../ScopInfo/stmt_split_scalar_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_within_loop.ll | 2 +- .../stmt_with_read_but_without_sideffect.ll | 2 +- polly/test/ScopInfo/switch-1.ll | 4 +- polly/test/ScopInfo/switch-2.ll | 4 +- polly/test/ScopInfo/switch-3.ll | 4 +- polly/test/ScopInfo/switch-4.ll | 4 +- polly/test/ScopInfo/switch-5.ll | 4 +- polly/test/ScopInfo/switch-6.ll | 4 +- polly/test/ScopInfo/switch-7.ll | 4 +- polly/test/ScopInfo/tempscop-printing.ll | 2 +- .../ScopInfo/test-wrapping-in-condition.ll | 4 +- polly/test/ScopInfo/truncate-1.ll | 2 +- polly/test/ScopInfo/truncate-2.ll | 2 +- polly/test/ScopInfo/truncate-3.ll | 3 +- polly/test/ScopInfo/two-loops-one-infinite.ll | 2 +- .../two-loops-right-after-each-other.ll | 2 +- polly/test/ScopInfo/undef_in_cond.ll | 2 +- polly/test/ScopInfo/unnamed_nonaffine.ll | 4 +- polly/test/ScopInfo/unnamed_stmts.ll | 2 +- .../ScopInfo/unpredictable_nonscop_loop.ll | 2 +- .../test/ScopInfo/unprofitable_scalar-accs.ll | 4 +- polly/test/ScopInfo/unsigned-condition.ll | 2 +- polly/test/ScopInfo/unsigned-division-1.ll | 2 +- polly/test/ScopInfo/unsigned-division-2.ll | 2 +- polly/test/ScopInfo/unsigned-division-3.ll | 2 +- polly/test/ScopInfo/unsigned-division-4.ll | 2 +- polly/test/ScopInfo/unsigned-division-5.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_uge.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ugt.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ule.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ult.ll | 2 +- polly/test/ScopInfo/user_context.ll | 8 +- ...ed_assumptions-in-bb-signed-conditional.ll | 4 +- .../user_provided_assumptions-in-bb-signed.ll | 2 +- ...ser_provided_assumptions-in-bb-unsigned.ll | 4 +- .../ScopInfo/user_provided_assumptions.ll | 4 +- .../ScopInfo/user_provided_assumptions_2.ll | 4 +- .../ScopInfo/user_provided_assumptions_3.ll | 4 +- ...ser_provided_non_dominating_assumptions.ll | 6 +- polly/test/ScopInfo/variant_base_pointer.ll | 4 +- .../ScopInfo/variant_load_empty_domain.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_0.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_1.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_2.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_3.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_4.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_5.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_6.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_7.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_1.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_2.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate_2.ll | 2 +- .../test/ScopInfo/zero_ext_space_mismatch.ll | 2 +- polly/test/ScopInliner/ignore-declares.ll | 2 +- polly/test/ScopInliner/invariant-load-func.ll | 2 +- polly/test/ScopInliner/simple-inline-loop.ll | 2 +- 
polly/test/Simplify/coalesce_3partials.ll | 2 +- .../Simplify/coalesce_disjointelements.ll | 2 +- polly/test/Simplify/coalesce_overlapping.ll | 2 +- polly/test/Simplify/coalesce_partial.ll | 2 +- polly/test/Simplify/dead_access_load.ll | 2 +- polly/test/Simplify/dead_access_phi.ll | 2 +- polly/test/Simplify/dead_access_value.ll | 2 +- polly/test/Simplify/dead_instruction.ll | 2 +- polly/test/Simplify/emptyaccessdomain.ll | 2 +- polly/test/Simplify/exit_phi_accesses-2.ll | 2 +- polly/test/Simplify/func-b320a7.ll | 2 +- polly/test/Simplify/gemm.ll | 2 +- .../Simplify/nocoalesce_differentvalues.ll | 2 +- .../Simplify/nocoalesce_elementmismatch.ll | 2 +- polly/test/Simplify/nocoalesce_readbetween.ll | 2 +- .../test/Simplify/nocoalesce_writebetween.ll | 2 +- polly/test/Simplify/notdead_region_exitphi.ll | 2 +- .../test/Simplify/notdead_region_innerphi.ll | 2 +- .../test/Simplify/notredundant_region_loop.ll | 2 +- .../Simplify/notredundant_region_middle.ll | 2 +- .../notredundant_synthesizable_unknownit.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- polly/test/Simplify/overwritten.ll | 2 +- polly/test/Simplify/overwritten_3phi.ll | 2 +- polly/test/Simplify/overwritten_3store.ll | 2 +- .../overwritten_implicit_and_explicit.ll | 2 +- .../test/Simplify/overwritten_loadbetween.ll | 2 +- polly/test/Simplify/overwritten_scalar.ll | 2 +- polly/test/Simplify/pass_existence.ll | 2 +- polly/test/Simplify/phi_in_regionstmt.ll | 2 +- polly/test/Simplify/pr33323.ll | 2 +- polly/test/Simplify/redundant.ll | 2 +- .../test/Simplify/redundant_differentindex.ll | 2 +- polly/test/Simplify/redundant_partialwrite.ll | 2 +- polly/test/Simplify/redundant_region.ll | 2 +- .../test/Simplify/redundant_region_scalar.ll | 2 +- polly/test/Simplify/redundant_scalarwrite.ll | 2 +- polly/test/Simplify/redundant_storebetween.ll | 2 +- polly/test/Simplify/scalability1.ll | 2 +- polly/test/Simplify/scalability2.ll | 2 +- polly/test/Simplify/sweep_mapped_phi.ll | 2 +- polly/test/Simplify/sweep_mapped_value.ll | 2 +- .../Simplify/ununsed_read_in_region_entry.ll | 4 +- polly/test/Support/Plugins.ll | 3 +- polly/test/Support/exportjson.ll | 24 +- polly/test/Support/isl-args.ll | 8 +- polly/test/Support/pipelineposition.ll | 8 +- polly/test/lit.site.cfg.in | 4 - polly/test/polly.ll | 2 +- 1143 files changed, 2700 insertions(+), 4460 deletions(-) delete mode 100644 polly/include/polly/LinkAllPasses.h create mode 100644 polly/include/polly/Pass/PhaseManager.h create mode 100644 polly/include/polly/Pass/PollyFunctionPass.h create mode 100644 polly/include/polly/Pass/PollyModulePass.h create mode 100644 polly/lib/Pass/PhaseManager.cpp create mode 100644 polly/lib/Pass/PollyFunctionPass.cpp create mode 100644 polly/lib/Pass/PollyModulePass.cpp create mode 100644 polly/test/ScopInfo/licm_potential_store_mssa.ll diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index f5ea47b69cf02..215a802843304 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -13,3 +13,7 @@ In Polly |version| the following important changes have been incorporated. * ScopInliner has been updated for the New Pass Manager. + * Polly is now a monolithic pass that is internally split into phases. + + * Polly's support for the legacy pass manager has been removed.
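The headers changed below replace the legacy wrapper passes with plain entry points (runDependenceAnalysis, runIslAstGen, runCodeGeneration, and so on) that the new PhaseManager invokes directly. The following is a minimal sketch of how those entry points compose per SCoP; only the run* function signatures are taken from this patch, while the glue code, the helper name processScop, and the phase order shown are illustrative assumptions rather than the actual logic in polly/lib/Pass/PhaseManager.cpp:

  // Hypothetical per-SCoP phase driver. Only the run* signatures below come
  // from this patch; ordering and control flow are illustrative assumptions.
  #include "polly/CodeGen/CodeGeneration.h"  // runCodeGeneration
  #include "polly/CodeGen/IslAst.h"          // runIslAstGen
  #include "polly/DeLICM.h"                  // runDeLICM
  #include "polly/DeadCodeElimination.h"     // runDeadCodeElim
  #include "polly/DependenceInfo.h"          // runDependenceAnalysis
  #include "polly/ForwardOpTree.h"           // runForwardOpTree
  #include "llvm/Analysis/RegionInfo.h"
  #include <memory>

  static bool processScop(polly::Scop &S, llvm::RegionInfo &RI) {
    using namespace polly;
    // Scalar-to-array phases run first so later phases see fewer scalar
    // dependencies.
    bool Changed = runForwardOpTree(S);
    Changed |= runDeLICM(S);

    // Dependences are computed once and handed explicitly to every phase that
    // needs them, instead of being cached behind a legacy analysis pass.
    DependenceAnalysis::Result DA = runDependenceAnalysis(S);
    Changed |= runDeadCodeElim(S, DA);

    // AST generation consumes the dependences; code generation consumes the
    // generated AST.
    if (std::unique_ptr<IslAstInfo> AI = runIslAstGen(S, DA))
      Changed |= runCodeGeneration(S, RI, *AI);
    return Changed;
  }

Threading the DependenceAnalysis::Result through the phases explicitly is what makes the per-SCoP caching in the removed DependenceInfo and DependenceInfoWrapperPass classes unnecessary.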
+ diff --git a/polly/include/polly/Canonicalization.h b/polly/include/polly/Canonicalization.h index 03f277e4e91ba..972b660894a1c 100644 --- a/polly/include/polly/Canonicalization.h +++ b/polly/include/polly/Canonicalization.h @@ -11,12 +11,6 @@ #include "llvm/Passes/PassBuilder.h" -namespace llvm { -namespace legacy { -class PassManagerBase; -} -} // namespace llvm - namespace polly { /// Schedule a set of canonicalization passes to prepare for Polly. @@ -26,8 +20,6 @@ namespace polly { /// into a canonical form that simplifies the analysis and optimization passes /// of Polly. The set of optimization passes scheduled here is probably not yet /// optimal. TODO: Optimize the set of canonicalization passes. -void registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM); - llvm::FunctionPassManager buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level); diff --git a/polly/include/polly/CodeGen/CodeGeneration.h b/polly/include/polly/CodeGen/CodeGeneration.h index 57aec1d70cc72..2340fbe016b49 100644 --- a/polly/include/polly/CodeGen/CodeGeneration.h +++ b/polly/include/polly/CodeGen/CodeGeneration.h @@ -14,6 +14,7 @@ #include "llvm/IR/PassManager.h" namespace polly { +class IslAstInfo; enum VectorizerChoice { VECTORIZER_NONE, @@ -33,6 +34,8 @@ struct CodeGenerationPass final : PassInfoMixin<CodeGenerationPass> { }; extern bool PerfMonitoring; + +bool runCodeGeneration(Scop &S, llvm::RegionInfo &RI, IslAstInfo &AI); } // namespace polly #endif // POLLY_CODEGENERATION_H diff --git a/polly/include/polly/CodeGen/IslAst.h b/polly/include/polly/CodeGen/IslAst.h index c99a4957d6b48..3e1ff2c8a24da 100644 --- a/polly/include/polly/CodeGen/IslAst.h +++ b/polly/include/polly/CodeGen/IslAst.h @@ -21,6 +21,7 @@ #ifndef POLLY_ISLAST_H #define POLLY_ISLAST_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/PassManager.h" @@ -172,33 +173,6 @@ struct IslAstAnalysis : AnalysisInfoMixin<IslAstAnalysis> { ScopStandardAnalysisResults &SAR); }; -class IslAstInfoWrapperPass final : public ScopPass { - std::unique_ptr<IslAstInfo> Ast; - -public: - static char ID; - - IslAstInfoWrapperPass() : ScopPass(ID) {} - - IslAstInfo &getAI() { return *Ast; } - const IslAstInfo &getAI() const { return *Ast; } - - /// Build the AST for the given SCoP @p S. - bool runOnScop(Scop &S) override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - - /// Release the internal memory. - void releaseMemory() override; - - /// Print a source code representation of the program. 
- void printScop(raw_ostream &OS, Scop &S) const override; -}; - -llvm::Pass *createIslAstInfoWrapperPassPass(); -llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); - struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { IslAstPrinterPass(raw_ostream &OS) : OS(OS) {} @@ -207,11 +181,9 @@ struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { raw_ostream &OS; }; -} // namespace polly -namespace llvm { -void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); -void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +std::unique_ptr<IslAstInfo> runIslAstGen(Scop &S, + DependenceAnalysis::Result &DA); +} // namespace polly #endif // POLLY_ISLAST_H diff --git a/polly/include/polly/CodePreparation.h b/polly/include/polly/CodePreparation.h index c6bc526db209d..1a15e3d4d5a29 100644 --- a/polly/include/polly/CodePreparation.h +++ b/polly/include/polly/CodePreparation.h @@ -15,6 +15,12 @@ #include "llvm/IR/PassManager.h" +namespace llvm { +class DominatorTree; +class LoopInfo; +class RegionInfo; +} // namespace llvm + namespace polly { struct CodePreparationPass final : llvm::PassInfoMixin<CodePreparationPass> { llvm::PreservedAnalyses run(llvm::Function &F, diff --git a/polly/include/polly/DeLICM.h b/polly/include/polly/DeLICM.h index 0e03c04079480..63fc509e0bd46 100644 --- a/polly/include/polly/DeLICM.h +++ b/polly/include/polly/DeLICM.h @@ -21,15 +21,10 @@ #include "isl/isl-noexceptions.h" namespace llvm { -class PassRegistry; -class Pass; class raw_ostream; } // namespace llvm namespace polly { -/// Create a new DeLICM pass instance. -llvm::Pass *createDeLICMWrapperPass(); -llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); struct DeLICMPass final : llvm::PassInfoMixin<DeLICMPass> { DeLICMPass() {} @@ -59,11 +54,7 @@ bool isConflicting(isl::union_set ExistingOccupied, isl::union_map ProposedWrites, llvm::raw_ostream *OS = nullptr, unsigned Indent = 0); +bool runDeLICM(Scop &S); } // namespace polly -namespace llvm { -void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); -void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_DELICM_H */ diff --git a/polly/include/polly/DeadCodeElimination.h b/polly/include/polly/DeadCodeElimination.h index d416afa030c56..4d8da56c76eec 100644 --- a/polly/include/polly/DeadCodeElimination.h +++ b/polly/include/polly/DeadCodeElimination.h @@ -13,16 +13,10 @@ #ifndef POLLY_DEADCODEELIMINATION_H #define POLLY_DEADCODEELIMINATION_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" -namespace llvm { -class PassRegistry; -class Pass; -class raw_ostream; -} // namespace llvm - namespace polly { -llvm::Pass *createDeadCodeElimWrapperPass(); struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { DeadCodeElimPass() {} @@ -31,10 +25,7 @@ struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; +bool runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA); } // namespace polly -namespace llvm { -void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_DEADCODEELIMINATION_H */ diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index d562ad80592f2..88ea468dd5473 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -145,7 +145,6 @@ class Dependences final { friend struct DependenceAnalysis; friend 
struct DependenceInfoPrinterPass; friend class DependenceInfo; - friend class DependenceInfoWrapperPass; /// Destructor that will free internal objects. ~Dependences() { releaseMemory(); } @@ -192,6 +191,8 @@ class Dependences final { const AnalysisLevel Level; }; +extern Dependences::AnalysisLevel OptAnalysisLevel; + struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> { static AnalysisKey Key; struct Result { @@ -232,108 +233,7 @@ struct DependenceInfoPrinterPass final raw_ostream &OS; }; -class DependenceInfo final : public ScopPass { -public: - static char ID; - - /// Construct a new DependenceInfo pass. - DependenceInfo() : ScopPass(ID) {} - - /// Return the dependence information for the current SCoP. - /// - /// @param Level The granularity of dependence analysis result. - /// - /// @return The dependence analysis result - /// - const Dependences &getDependences(Dependences::AnalysisLevel Level); - - /// Recompute dependences from schedule and memory accesses. - const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); - - /// Invalidate the dependence information and recompute it when needed again. - /// May be required when the underlying Scop was changed in a way that would - /// add new dependencies (e.g. between new statement instances insierted into - /// the SCoP) or intentionally breaks existing ones. It is not required when - /// updating the schedule that conforms the existing dependencies. - void abandonDependences(); - - /// Compute the dependence information for the SCoP @p S. - bool runOnScop(Scop &S) override; - - /// Print the dependences for the given SCoP to @p OS. - void printScop(raw_ostream &OS, Scop &) const override; - - /// Release the internal memory. - void releaseMemory() override { - for (auto &d : D) - d.reset(); - } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - Scop *S; - - /// Dependences struct for the current SCoP. - std::unique_ptr<Dependences> D[Dependences::NumAnalysisLevels]; -}; - -llvm::Pass *createDependenceInfoPass(); -llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); - -/// Construct a new DependenceInfoWrapper pass. -class DependenceInfoWrapperPass final : public FunctionPass { -public: - static char ID; - - /// Construct a new DependenceInfoWrapper pass. - DependenceInfoWrapperPass() : FunctionPass(ID) {} - - /// Return the dependence information for the given SCoP. - /// - /// @param S SCoP object. - /// @param Level The granularity of dependence analysis result. - /// - /// @return The dependence analysis result - /// - const Dependences &getDependences(Scop *S, Dependences::AnalysisLevel Level); - - /// Recompute dependences from schedule and memory accesses. - const Dependences &recomputeDependences(Scop *S, - Dependences::AnalysisLevel Level); - - /// Compute the dependence information on-the-fly for the function. - bool runOnFunction(Function &F) override; - - /// Print the dependences for the current function to @p OS. - void print(raw_ostream &OS, const Module *M = nullptr) const override; - - /// Release the internal memory. - void releaseMemory() override { ScopToDepsMap.clear(); } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - using ScopToDepsMapTy = DenseMap<Scop *, std::unique_ptr<Dependences>>; - - /// Scop to Dependence map for the current function. 
-  ScopToDepsMapTy ScopToDepsMap;
-};
-
-llvm::Pass *createDependenceInfoWrapperPassPass();
-llvm::Pass *
-createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS);
-
+DependenceAnalysis::Result runDependenceAnalysis(Scop &S);
 } // namespace polly
 
-namespace llvm {
-void initializeDependenceInfoPass(llvm::PassRegistry &);
-void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &);
-void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &);
-void initializeDependenceInfoPrinterLegacyFunctionPassPass(
-    llvm::PassRegistry &);
-} // namespace llvm
-
 #endif
diff --git a/polly/include/polly/FlattenSchedule.h b/polly/include/polly/FlattenSchedule.h
index 3ef3c304243df..154344d2f5c3e 100644
--- a/polly/include/polly/FlattenSchedule.h
+++ b/polly/include/polly/FlattenSchedule.h
@@ -15,20 +15,10 @@
 #ifndef POLLY_FLATTENSCHEDULE_H
 #define POLLY_FLATTENSCHEDULE_H
 
-namespace llvm {
-class PassRegistry;
-class Pass;
-class raw_ostream;
-} // namespace llvm
-
 namespace polly {
-llvm::Pass *createFlattenSchedulePass();
-llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS);
-} // namespace polly
+class Scop;
 
-namespace llvm {
-void initializeFlattenSchedulePass(llvm::PassRegistry &);
-void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &);
-} // namespace llvm
+void runFlattenSchedulePass(Scop &S);
+} // namespace polly
 
 #endif /* POLLY_FLATTENSCHEDULE_H */
diff --git a/polly/include/polly/ForwardOpTree.h b/polly/include/polly/ForwardOpTree.h
index b5da0f513ab78..8b2ece1f08e15 100644
--- a/polly/include/polly/ForwardOpTree.h
+++ b/polly/include/polly/ForwardOpTree.h
@@ -15,13 +15,7 @@
 
 #include "polly/ScopPass.h"
 
-namespace llvm {
-class PassRegistry;
-} // namespace llvm
-
 namespace polly {
-llvm::Pass *createForwardOpTreeWrapperPass();
-llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS);
 
 struct ForwardOpTreePass final : llvm::PassInfoMixin<ForwardOpTreePass> {
   ForwardOpTreePass() {}
@@ -41,11 +35,15 @@ struct ForwardOpTreePrinterPass final
   llvm::raw_ostream &OS;
 };
 
+/// Pass that redirects scalar reads to array elements that are known to
+/// contain the same value.
+///
+/// This reduces the number of scalar accesses and therefore potentially
+/// increases the freedom of the scheduler. In the ideal case, all reads of a
+/// scalar definition are redirected (We currently do not care about removing
+/// the write in this case). This is also useful for the main DeLICM pass as
+/// there are fewer scalars to be mapped.
+bool runForwardOpTree(Scop &S);
 } // namespace polly
 
-namespace llvm {
-void initializeForwardOpTreeWrapperPassPass(PassRegistry &);
-void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &);
-} // namespace llvm
-
 #endif // POLLY_FORWARDOPTREE_H
diff --git a/polly/include/polly/JSONExporter.h b/polly/include/polly/JSONExporter.h
index 958f95ea11404..82a881c737064 100644
--- a/polly/include/polly/JSONExporter.h
+++ b/polly/include/polly/JSONExporter.h
@@ -9,13 +9,11 @@
 #ifndef POLLY_JSONEXPORTER_H
 #define POLLY_JSONEXPORTER_H
 
+#include "polly/DependenceInfo.h"
 #include "polly/ScopPass.h"
 #include "llvm/IR/PassManager.h"
 
 namespace polly {
-llvm::Pass *createJSONExporterPass();
-llvm::Pass *createJSONImporterPass();
-llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS);
 
 /// This pass exports a scop to a jscop file. The filename is generated from the
 /// concatenation of the function and scop name.
@@ -30,12 +28,9 @@ struct JSONImportPass final : llvm::PassInfoMixin<JSONExportPass> { llvm::PreservedAnalyses run(Scop &, ScopAnalysisManager &, ScopStandardAnalysisResults &, SPMUpdater &); }; -} // namespace polly -namespace llvm { -void initializeJSONExporterPass(llvm::PassRegistry &); -void initializeJSONImporterPass(llvm::PassRegistry &); -void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +void runImportJSON(Scop &S, DependenceAnalysis::Result &DA); +void runExportJSON(Scop &S); +} // namespace polly #endif /* POLLY_JSONEXPORTER_H */ diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h deleted file mode 100644 index 9978344c73e9f..0000000000000 --- a/polly/include/polly/LinkAllPasses.h +++ /dev/null @@ -1,156 +0,0 @@ -//===- polly/LinkAllPasses.h ----------- Reference All Passes ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header file pulls in all transformation and analysis passes for tools -// like opt and bugpoint that need this functionality. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_LINKALLPASSES_H -#define POLLY_LINKALLPASSES_H - -#include "polly/Config/config.h" -#include "polly/Support/DumpFunctionPass.h" -#include "polly/Support/DumpModulePass.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/AlwaysTrue.h" - -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - -namespace polly { -llvm::Pass *createCodePreparationPass(); -llvm::Pass *createScopInlinerPass(); -llvm::Pass *createDeadCodeElimWrapperPass(); -llvm::Pass *createDependenceInfoPass(); -llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createDependenceInfoWrapperPassPass(); -llvm::Pass * -createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); -llvm::Pass *createDOTOnlyPrinterWrapperPass(); -llvm::Pass *createDOTOnlyViewerWrapperPass(); -llvm::Pass *createDOTPrinterWrapperPass(); -llvm::Pass *createDOTViewerWrapperPass(); -llvm::Pass *createJSONExporterPass(); -llvm::Pass *createJSONImporterPass(); -llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createPollyCanonicalizePass(); -llvm::Pass *createScopDetectionWrapperPassPass(); -llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createScopInfoRegionPassPass(); -llvm::Pass *createScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS); -llvm::Pass *createScopInfoWrapperPassPass(); -llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); -llvm::Pass *createIslAstInfoWrapperPassPass(); -llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createCodeGenerationPass(); -llvm::Pass *createIslScheduleOptimizerWrapperPass(); -llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createFlattenSchedulePass(); -llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createForwardOpTreeWrapperPass(); -llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createDeLICMWrapperPass(); -llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass 
*createMaximalStaticExpansionPass(); -llvm::Pass *createSimplifyWrapperPass(int); -llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createPruneUnprofitableWrapperPass(); - -extern char &CodePreparationID; -} // namespace polly - -namespace { -struct PollyForcePassLinking { - PollyForcePassLinking() { - // We must reference the passes in such a way that compilers will not delete - // it all as dead code, even with whole program optimization, yet is - // effectively a NO-OP. - if (llvm::getNonFoldableAlwaysTrue()) - return; - - polly::createCodePreparationPass(); - polly::createDeadCodeElimWrapperPass(); - polly::createDependenceInfoPass(); - polly::createDependenceInfoPrinterLegacyPass(llvm::outs()); - polly::createDependenceInfoWrapperPassPass(); - polly::createDependenceInfoPrinterLegacyFunctionPass(llvm::outs()); - polly::createDOTOnlyPrinterWrapperPass(); - polly::createDOTOnlyViewerWrapperPass(); - polly::createDOTPrinterWrapperPass(); - polly::createDOTViewerWrapperPass(); - polly::createJSONExporterPass(); - polly::createJSONImporterPass(); - polly::createJSONImporterPrinterLegacyPass(llvm::outs()); - polly::createScopDetectionWrapperPassPass(); - polly::createScopDetectionPrinterLegacyPass(llvm::outs()); - polly::createScopInfoRegionPassPass(); - polly::createScopInfoPrinterLegacyRegionPass(llvm::outs()); - polly::createScopInfoWrapperPassPass(); - polly::createScopInfoPrinterLegacyFunctionPass(llvm::outs()); - polly::createPollyCanonicalizePass(); - polly::createIslAstInfoWrapperPassPass(); - polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); - polly::createCodeGenerationPass(); - polly::createIslScheduleOptimizerWrapperPass(); - polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); - polly::createMaximalStaticExpansionPass(); - polly::createFlattenSchedulePass(); - polly::createFlattenSchedulePrinterLegacyPass(llvm::errs()); - polly::createForwardOpTreeWrapperPass(); - polly::createForwardOpTreePrinterLegacyPass(llvm::errs()); - polly::createDeLICMWrapperPass(); - polly::createDeLICMPrinterLegacyPass(llvm::outs()); - polly::createDumpModuleWrapperPass("", true); - polly::createDumpFunctionWrapperPass(""); - polly::createSimplifyWrapperPass(0); - polly::createSimplifyPrinterLegacyPass(llvm::outs()); - polly::createPruneUnprofitableWrapperPass(); - } -} PollyForcePassLinking; // Force link by creating a global definition. 
-} // namespace - -namespace llvm { -void initializeCodePreparationPass(llvm::PassRegistry &); -void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeScopInfoRegionPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyRegionPassPass(llvm::PassRegistry &); -void initializeScopInfoWrapperPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); -void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); -void initializeJSONExporterPass(llvm::PassRegistry &); -void initializeJSONImporterPass(llvm::PassRegistry &); -void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeDependenceInfoPass(llvm::PassRegistry &); -void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); -void initializeDependenceInfoPrinterLegacyFunctionPassPass( - llvm::PassRegistry &); -void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); -void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeCodeGenerationPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); -void initializePollyCanonicalizePass(llvm::PassRegistry &); -void initializeFlattenSchedulePass(llvm::PassRegistry &); -void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); -void initializeForwardOpTreeWrapperPassPass(llvm::PassRegistry &); -void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); -void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); -void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); -void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); -void initializePruneUnprofitableWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - -#endif diff --git a/polly/include/polly/MaximalStaticExpansion.h b/polly/include/polly/MaximalStaticExpansion.h index 88827b2700887..1f9fbcb1d6a70 100644 --- a/polly/include/polly/MaximalStaticExpansion.h +++ b/polly/include/polly/MaximalStaticExpansion.h @@ -14,6 +14,7 @@ #ifndef POLLY_MAXIMALSTATICEXPANSION_H #define POLLY_MAXIMALSTATICEXPANSION_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" @@ -37,6 +38,7 @@ struct MaximalStaticExpansionPrinterPass llvm::raw_ostream &OS; }; +void runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI); } // namespace polly #endif /* POLLY_MAXIMALSTATICEXPANSION_H */ diff --git a/polly/include/polly/Pass/PhaseManager.h b/polly/include/polly/Pass/PhaseManager.h new file mode 100644 index 0000000000000..9ff9bbf02d71f --- /dev/null +++ b/polly/include/polly/Pass/PhaseManager.h @@ -0,0 +1,127 @@ +//===------ PhaseManager.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the sequence of operations on SCoPs, called phases. It is itself
+// not a pass in either pass manager, but is used from PollyFunctionPass or
+// PollyModulePass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_PASS_PHASEMANAGER_H_
+#define POLLY_PASS_PHASEMANAGER_H_
+
+#include "polly/DependenceInfo.h"
+#include "llvm/ADT/Bitset.h"
+#include <stddef.h>
+
+namespace llvm {
+class Function;
+class Error;
+} // namespace llvm
+
+namespace polly {
+
+/// Phases (in execution order) within the Polly pass.
+enum class PassPhase {
+  None,
+
+  Prepare,
+
+  Detection,
+  PrintDetect,
+  DotScops,
+  DotScopsOnly,
+  ViewScops,
+  ViewScopsOnly,
+
+  ScopInfo,
+  PrintScopInfo,
+
+  Flatten,
+
+  Dependences,
+  PrintDependences,
+
+  ImportJScop,
+  Simplify0,
+  Optree,
+  DeLICM,
+  Simplify1,
+  DeadCodeElimination,
+  MaximumStaticExtension,
+  PruneUnprofitable,
+  Optimization,
+  ExportJScop,
+  AstGen,
+  CodeGen,
+
+  PassPhaseFirst = Prepare,
+  PassPhaseLast = CodeGen
+};
+
+StringRef getPhaseName(PassPhase Phase);
+PassPhase parsePhase(StringRef Name);
+bool dependsOnDependenceInfo(PassPhase Phase);
+
+/// Options for the Polly pass.
+class PollyPassOptions {
+  /// For each Polly phase, whether it should be executed.
+  /// Since PassPhase::None is unused, bit positions are shifted by one.
+  llvm::Bitset<static_cast<size_t>(PassPhase::PassPhaseLast) -
+               static_cast<size_t>(PassPhase::PassPhaseFirst) + 1>
+      PhaseEnabled;
+
+public:
+  bool ViewAll = false;
+  std::string ViewFilter;
+  Dependences::AnalysisLevel PrintDepsAnalysisLevel = Dependences::AL_Statement;
+
+  bool isPhaseEnabled(PassPhase Phase) const {
+    assert(Phase != PassPhase::None);
+    unsigned BitPos = static_cast<size_t>(Phase) -
+                      static_cast<size_t>(PassPhase::PassPhaseFirst);
+    return PhaseEnabled[BitPos];
+  }
+
+  void setPhaseEnabled(PassPhase Phase, bool Enabled = true) {
+    assert(Phase != PassPhase::None);
+    unsigned BitPos = static_cast<size_t>(Phase) -
+                      static_cast<size_t>(PassPhase::PassPhaseFirst);
+    if (Enabled)
+      PhaseEnabled.set(BitPos);
+    else
+      PhaseEnabled.reset(BitPos);
+  }
+
+  /// Enable all phases that are necessary for a roundtrip from LLVM-IR back to
+  /// LLVM-IR.
+  void enableEnd2End();
+
+  /// Enable the default optimization phases.
+  void enableDefaultOpts();
+
+  /// Disable all phases following \p Phase.
+  /// Useful when regression-testing a particular phase and everything after it
+  /// is of no interest.
+  void disableAfter(PassPhase Phase);
+
+  /// Check whether the options are mutually consistent.
+  llvm::Error checkConsistency() const;
+};
+
+/// Run Polly and its phases on \p F.
+bool runPollyPass(Function &F, llvm::FunctionAnalysisManager &FAM,
+                  PollyPassOptions Opts);
+} // namespace polly
+
+/// Make llvm::enum_seq<PassPhase> work.
+template <> struct llvm::enum_iteration_traits<polly::PassPhase> { + static constexpr bool is_iterable = true; +}; + +#endif /* POLLY_PASS_PHASEMANAGER_H_ */ diff --git a/polly/include/polly/Pass/PollyFunctionPass.h b/polly/include/polly/Pass/PollyFunctionPass.h new file mode 100644 index 0000000000000..dd0d4e77d7a80 --- /dev/null +++ b/polly/include/polly/Pass/PollyFunctionPass.h @@ -0,0 +1,32 @@ +//===------ PollyFunctionPass.h - Polly function pass ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PASS_POLLYFUNCTIONPASS_H_ +#define POLLY_PASS_POLLYFUNCTIONPASS_H_ + +#include "polly/Pass/PhaseManager.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" +#include <utility> + +namespace polly { + +class PollyFunctionPass : public llvm::PassInfoMixin<PollyFunctionPass> { +public: + PollyFunctionPass() {} + PollyFunctionPass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} + + llvm::PreservedAnalyses run(llvm::Function &F, + llvm::FunctionAnalysisManager &); + +private: + PollyPassOptions Opts; +}; +} // namespace polly + +#endif /* POLLY_PASS_POLLYFUNCTIONPASS_H_ */ diff --git a/polly/include/polly/Pass/PollyModulePass.h b/polly/include/polly/Pass/PollyModulePass.h new file mode 100644 index 0000000000000..2214bbf3d143e --- /dev/null +++ b/polly/include/polly/Pass/PollyModulePass.h @@ -0,0 +1,30 @@ +//===------ PollyModulePass.h - Polly module pass -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PASS_POLLYMODULEPASS_H_ +#define POLLY_PASS_POLLYMODULEPASS_H_ + +#include "polly/Pass/PhaseManager.h" +#include "llvm/IR/PassManager.h" + +namespace polly { + +class PollyModulePass : public llvm::PassInfoMixin<PollyModulePass> { +public: + PollyModulePass() {} + PollyModulePass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &); + +private: + PollyPassOptions Opts; +}; + +} // namespace polly + +#endif /* POLLY_PASS_POLLYMODULEPASS_H_ */ diff --git a/polly/include/polly/PruneUnprofitable.h b/polly/include/polly/PruneUnprofitable.h index 2d285cce69ad4..16b76cc62f1d2 100644 --- a/polly/include/polly/PruneUnprofitable.h +++ b/polly/include/polly/PruneUnprofitable.h @@ -15,13 +15,7 @@ #include "polly/ScopPass.h" -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - namespace polly { -llvm::Pass *createPruneUnprofitableWrapperPass(); struct PruneUnprofitablePass final : llvm::PassInfoMixin<PruneUnprofitablePass> { @@ -30,10 +24,8 @@ struct PruneUnprofitablePass final llvm::PreservedAnalyses run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; -} // namespace polly -namespace llvm { -void initializePruneUnprofitableWrapperPassPass(PassRegistry &); -} +bool runPruneUnprofitable(Scop &S); +} // namespace polly #endif // POLLY_PRUNEUNPROFITABLE_H diff --git a/polly/include/polly/RegisterPasses.h b/polly/include/polly/RegisterPasses.h index 3a81e1ba7487d..7819462cb0c36 100644 --- a/polly/include/polly/RegisterPasses.h +++ b/polly/include/polly/RegisterPasses.h @@ -14,7 +14,6 @@ #define POLLY_REGISTER_PASSES_H namespace llvm { -class PassRegistry; class PassBuilder; struct PassPluginLibraryInfo; namespace legacy { @@ -23,7 +22,6 @@ class PassManagerBase; } // namespace llvm namespace polly { -void initializePollyPasses(llvm::PassRegistry &Registry); void registerPollyPasses(llvm::PassBuilder &PB); } // namespace polly diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h index 3e17eeff49ae3..ac45572ba7ed5 100644 --- a/polly/include/polly/ScheduleOptimizer.h +++ b/polly/include/polly/ScheduleOptimizer.h @@ -9,16 +9,10 @@ #ifndef POLLY_SCHEDULEOPTIMIZER_H #define POLLY_SCHEDULEOPTIMIZER_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - namespace polly { -llvm::Pass *createIslScheduleOptimizerWrapperPass(); -llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); struct IslScheduleOptimizerPass final : llvm::PassInfoMixin<IslScheduleOptimizerPass> { @@ -38,11 +32,9 @@ struct IslScheduleOptimizerPrinterPass final private: llvm::raw_ostream &OS; }; -} // namespace polly -namespace llvm { -void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +void runIslScheduleOptimizer(Scop &S, llvm::TargetTransformInfo *TTI, + DependenceAnalysis::Result &Deps); +} // namespace polly #endif // POLLY_SCHEDULEOPTIMIZER_H diff --git a/polly/include/polly/ScopDetection.h b/polly/include/polly/ScopDetection.h index 5759f75463284..ded1c88206430 100644 --- a/polly/include/polly/ScopDetection.h +++ b/polly/include/polly/ScopDetection.h @@ -52,7 +52,6 @@ 
#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Pass.h" #include <set> namespace polly { @@ -68,7 +67,6 @@ using llvm::DenseMap; using llvm::DominatorTree; using llvm::Function; using llvm::FunctionAnalysisManager; -using llvm::FunctionPass; using llvm::IntrinsicInst; using llvm::LoopInfo; using llvm::Module; @@ -631,31 +629,6 @@ struct ScopAnalysisPrinterPass final : PassInfoMixin<ScopAnalysisPrinterPass> { raw_ostream &OS; }; - -class ScopDetectionWrapperPass final : public FunctionPass { - std::unique_ptr<ScopDetection> Result; - -public: - ScopDetectionWrapperPass(); - - /// @name FunctionPass interface - ///@{ - static char ID; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnFunction(Function &F) override; - void print(raw_ostream &OS, const Module *M = nullptr) const override; - ///@} - - ScopDetection &getSD() const { return *Result; } -}; - -llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly -namespace llvm { -void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif // POLLY_SCOPDETECTION_H diff --git a/polly/include/polly/ScopGraphPrinter.h b/polly/include/polly/ScopGraphPrinter.h index b57732ad3d70d..c4e669f0c3503 100644 --- a/polly/include/polly/ScopGraphPrinter.h +++ b/polly/include/polly/ScopGraphPrinter.h @@ -70,6 +70,9 @@ struct DOTGraphTraits<polly::ScopDetection *> : DOTGraphTraits<RegionNode *> { namespace polly { +extern std::string ViewFilter; +extern bool ViewAll; + struct ScopViewer final : llvm::DOTGraphTraitsViewer<ScopAnalysis, false> { ScopViewer() : llvm::DOTGraphTraitsViewer<ScopAnalysis, false>("scops") {} diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index f700144165d53..7541ddc21e39f 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -23,13 +23,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/RegionPass.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" #include "isl/isl-noexceptions.h" #include <cassert> #include <cstddef> @@ -55,8 +53,6 @@ using llvm::MemIntrinsic; using llvm::PassInfoMixin; using llvm::PHINode; using llvm::RegionNode; -using llvm::RegionPass; -using llvm::RGPassManager; using llvm::SetVector; using llvm::SmallPtrSetImpl; using llvm::SmallVector; @@ -2674,39 +2670,6 @@ class Scop final { /// Print Scop scop to raw_ostream OS. raw_ostream &operator<<(raw_ostream &OS, const Scop &scop); -/// The legacy pass manager's analysis pass to compute scop information -/// for a region. -class ScopInfoRegionPass final : public RegionPass { - /// The Scop pointer which is used to construct a Scop. - std::unique_ptr<Scop> S; - -public: - static char ID; // Pass identification, replacement for typeid - - ScopInfoRegionPass() : RegionPass(ID) {} - ~ScopInfoRegionPass() override = default; - - /// Build Scop object, the Polly IR of static control - /// part for the current SESE-Region. - /// - /// @return If the current region is a valid for a static control part, - /// return the Polly IR representing this static control part, - /// return null otherwise. 
- Scop *getScop() { return S.get(); } - const Scop *getScop() const { return S.get(); } - - /// Calculate the polyhedral scop information for a given Region. - bool runOnRegion(Region *R, RGPassManager &RGM) override; - - void releaseMemory() override { S.reset(); } - - void print(raw_ostream &O, const Module *M = nullptr) const override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -llvm::Pass *createScopInfoPrinterLegacyRegionPass(raw_ostream &OS); - class ScopInfo { public: using RegionToScopMapTy = MapVector<Region *, std::unique_ptr<Scop>>; @@ -2781,45 +2744,6 @@ struct ScopInfoPrinterPass final : PassInfoMixin<ScopInfoPrinterPass> { raw_ostream &Stream; }; - -//===----------------------------------------------------------------------===// -/// The legacy pass manager's analysis pass to compute scop information -/// for the whole function. -/// -/// This pass will maintain a map of the maximal region within a scop to its -/// scop object for all the feasible scops present in a function. -/// This pass is an alternative to the ScopInfoRegionPass in order to avoid a -/// region pass manager. -class ScopInfoWrapperPass final : public FunctionPass { - std::unique_ptr<ScopInfo> Result; - -public: - ScopInfoWrapperPass() : FunctionPass(ID) {} - ~ScopInfoWrapperPass() override = default; - - static char ID; // Pass identification, replacement for typeid - - ScopInfo *getSI() { return Result.get(); } - const ScopInfo *getSI() const { return Result.get(); } - - /// Calculate all the polyhedral scops for a given function. - bool runOnFunction(Function &F) override; - - void releaseMemory() override { Result.reset(); } - - void print(raw_ostream &O, const Module *M = nullptr) const override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); } // end namespace polly -namespace llvm { -void initializeScopInfoRegionPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyRegionPassPass(PassRegistry &); -void initializeScopInfoWrapperPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); -} // end namespace llvm - #endif // POLLY_SCOPINFO_H diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h index 014667804330f..ae1938f03ac70 100644 --- a/polly/include/polly/ScopInliner.h +++ b/polly/include/polly/ScopInliner.h @@ -23,12 +23,6 @@ class ScopInlinerPass : public llvm::PassInfoMixin<ScopInlinerPass> { llvm::LazyCallGraph &CG, llvm::CGSCCUpdateResult &UR); }; - -llvm::Pass *createScopInlinerWrapperPass(); } // namespace polly -namespace llvm { -void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); -} - #endif /* POLLY_POLLYINLINER_H */ diff --git a/polly/include/polly/ScopPass.h b/polly/include/polly/ScopPass.h index 144cfd1364393..80ccd5717f96c 100644 --- a/polly/include/polly/ScopPass.h +++ b/polly/include/polly/ScopPass.h @@ -19,7 +19,6 @@ #include "polly/ScopInfo.h" #include "llvm/ADT/PriorityWorklist.h" -#include "llvm/Analysis/RegionPass.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassManagerImpl.h" @@ -155,33 +154,6 @@ using ScopPassManager = PassManager<Scop, ScopAnalysisManager, ScopStandardAnalysisResults &, SPMUpdater &>; -/// ScopPass - This class adapts the RegionPass interface to allow convenient -/// creation of passes that operate on the Polly IR. Instead of overriding -/// runOnRegion, subclasses override runOnScop. 
-class ScopPass : public RegionPass { - Scop *S; - -protected: - explicit ScopPass(char &ID) : RegionPass(ID), S(nullptr) {} - - /// runOnScop - This method must be overloaded to perform the - /// desired Polyhedral transformation or analysis. - /// - virtual bool runOnScop(Scop &S) = 0; - - /// Print method for SCoPs. - virtual void printScop(raw_ostream &OS, Scop &S) const {} - - /// getAnalysisUsage - Subclasses that override getAnalysisUsage - /// must call this. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - bool runOnRegion(Region *R, RGPassManager &RGM) override; - void print(raw_ostream &OS, const Module *) const override; -}; - struct ScopStandardAnalysisResults { DominatorTree &DT; ScopInfo &SI; diff --git a/polly/include/polly/Simplify.h b/polly/include/polly/Simplify.h index b2aa58d850fae..4565eb26edaf0 100644 --- a/polly/include/polly/Simplify.h +++ b/polly/include/polly/Simplify.h @@ -16,11 +16,6 @@ #include "polly/ScopPass.h" #include "llvm/ADT/SmallVector.h" -namespace llvm { -class PassRegistry; -class Pass; -} // namespace llvm - namespace polly { class MemoryAccess; class ScopStmt; @@ -41,17 +36,6 @@ class ScopStmt; /// undefined. llvm::SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt); -/// Create a Simplify pass -/// -/// @param CallNo Disambiguates this instance for when there are multiple -/// instances of this pass in the pass manager. It is used only to -/// keep the statistics apart and has no influence on the -/// simplification itself. -/// -/// @return The Simplify pass. -llvm::Pass *createSimplifyWrapperPass(int CallNo = 0); -llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); - struct SimplifyPass final : PassInfoMixin<SimplifyPass> { SimplifyPass(int CallNo = 0) : CallNo(CallNo) {} @@ -73,11 +57,8 @@ struct SimplifyPrinterPass final : PassInfoMixin<SimplifyPrinterPass> { raw_ostream &OS; int CallNo; }; -} // namespace polly -namespace llvm { -void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); -void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +bool runSimplify(Scop &S, int CallNo); +} // namespace polly #endif /* POLLY_TRANSFORM_SIMPLIFY_H */ diff --git a/polly/include/polly/Support/DumpFunctionPass.h b/polly/include/polly/Support/DumpFunctionPass.h index e5c16203adb8f..af04912ed4fe2 100644 --- a/polly/include/polly/Support/DumpFunctionPass.h +++ b/polly/include/polly/Support/DumpFunctionPass.h @@ -16,13 +16,7 @@ #include "llvm/IR/PassManager.h" #include <string> -namespace llvm { -class FunctionPass; -class ModulePass; -} // namespace llvm - namespace polly { -llvm::FunctionPass *createDumpFunctionWrapperPass(std::string Suffix); /// A pass that isolates a function into a new Module and writes it into a file. 
struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { @@ -33,12 +27,6 @@ struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); }; - } // namespace polly -namespace llvm { -class PassRegistry; -void initializeDumpFunctionWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_SUPPORT_DUMPFUNCTIONPASS_H */ diff --git a/polly/include/polly/Support/DumpModulePass.h b/polly/include/polly/Support/DumpModulePass.h index c90bbc2484310..6d393a174b19b 100644 --- a/polly/include/polly/Support/DumpModulePass.h +++ b/polly/include/polly/Support/DumpModulePass.h @@ -16,12 +16,8 @@ #include "llvm/IR/PassManager.h" #include <string> -namespace llvm { -class ModulePass; -} // namespace llvm - namespace polly { -/// Create a pass that prints the module into a file. +/// A pass that prints the module into a file. /// /// The meaning of @p Filename depends on @p IsSuffix. If IsSuffix==false, then /// the module is written to the @p Filename. If it is true, the filename is @@ -30,10 +26,6 @@ namespace polly { /// The intent of IsSuffix is to avoid the file being overwritten when /// processing multiple modules and/or with multiple dump passes in the /// pipeline. -llvm::ModulePass *createDumpModuleWrapperPass(std::string Filename, - bool IsSuffix); - -/// A pass that prints the module into a file. struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { std::string Filename; bool IsSuffix; @@ -46,9 +38,4 @@ struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { } // namespace polly -namespace llvm { -class PassRegistry; -void initializeDumpModuleWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_SUPPORT_DUMPMODULEPASS_H */ diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 75891525ff7b3..38b731a9f7d8d 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -358,14 +358,6 @@ namespace polly { void simplifyRegion(llvm::Region *R, llvm::DominatorTree *DT, llvm::LoopInfo *LI, llvm::RegionInfo *RI); -/// Split the entry block of a function to store the newly inserted -/// allocations outside of all Scops. -/// -/// @param EntryBlock The entry block of the current function. -/// @param P The pass that currently running. -/// -void splitEntryBlockForAlloca(llvm::BasicBlock *EntryBlock, llvm::Pass *P); - /// Split the entry block of a function to store the newly inserted /// allocations outside of all Scops. 
/// diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index c620f40ad0724..5183fc5725ece 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// // #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/GICHelper.h" @@ -42,6 +41,10 @@ using namespace llvm; #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-dependence" +namespace polly { +Dependences::AnalysisLevel OptAnalysisLevel; +} + static cl::opt<int> OptComputeOut( "polly-dependences-computeout", cl::desc("Bound the dependence analysis by a maximal amount of " @@ -69,9 +72,10 @@ static cl::opt<enum AnalysisType> OptAnalysisType( "Overapproximation of dependences")), cl::Hidden, cl::init(VALUE_BASED_ANALYSIS), cl::cat(PollyCategory)); -static cl::opt<Dependences::AnalysisLevel> OptAnalysisLevel( +static cl::opt<Dependences::AnalysisLevel, true> XOptAnalysisLevel( "polly-dependences-analysis-level", cl::desc("The level of dependence analysis"), + cl::location(OptAnalysisLevel), cl::values(clEnumValN(Dependences::AL_Statement, "statement-wise", "Statement-level analysis"), clEnumValN(Dependences::AL_Reference, "reference-wise", @@ -881,213 +885,7 @@ DependenceInfoPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -const Dependences & -DependenceInfo::getDependences(Dependences::AnalysisLevel Level) { - if (Dependences *d = D[Level].get()) - return *d; - - return recomputeDependences(Level); -} - -const Dependences & -DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) { - D[Level].reset(new Dependences(S->getSharedIslCtx(), Level)); - D[Level]->calculateDependences(*S); - return *D[Level]; -} - -void DependenceInfo::abandonDependences() { - for (std::unique_ptr<Dependences> &Deps : D) - Deps.release(); -} - -bool DependenceInfo::runOnScop(Scop &ScopVar) { - S = &ScopVar; - return false; -} - -/// Print the dependences for the given SCoP to @p OS. - -void polly::DependenceInfo::printScop(raw_ostream &OS, Scop &S) const { - if (auto d = D[OptAnalysisLevel].get()) { - d->print(OS); - return; - } - - // Otherwise create the dependences on-the-fly and print it - Dependences D(S.getSharedIslCtx(), OptAnalysisLevel); - D.calculateDependences(S); - D.print(OS); -} - -void DependenceInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.setPreservesAll(); -} - -char DependenceInfo::ID = 0; - -Pass *polly::createDependenceInfoPass() { return new DependenceInfo(); } - -INITIALIZE_PASS_BEGIN(DependenceInfo, "polly-dependences", - "Polly - Calculate dependences", false, false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_END(DependenceInfo, "polly-dependences", - "Polly - Calculate dependences", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from DependenceAnalysis. 
-class DependenceInfoPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - DependenceInfoPrinterLegacyPass() : DependenceInfoPrinterLegacyPass(outs()) {} - - explicit DependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - DependenceInfo &P = getAnalysis<DependenceInfo>(); - - OS << "Printing analysis '" << P.getPassName() << "' for " - << "region: '" << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DependenceInfoPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createDependenceInfoPrinterLegacyPass(raw_ostream &OS) { - return new DependenceInfoPrinterLegacyPass(OS); -} - -INITIALIZE_PASS_BEGIN(DependenceInfoPrinterLegacyPass, - "polly-print-dependences", "Polly - Print dependences", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_END(DependenceInfoPrinterLegacyPass, "polly-print-dependences", - "Polly - Print dependences", false, false) - -//===----------------------------------------------------------------------===// - -const Dependences & -DependenceInfoWrapperPass::getDependences(Scop *S, - Dependences::AnalysisLevel Level) { - auto It = ScopToDepsMap.find(S); - if (It != ScopToDepsMap.end()) - if (It->second) { - if (It->second->getDependenceLevel() == Level) - return *It->second; - } - return recomputeDependences(S, Level); -} - -const Dependences &DependenceInfoWrapperPass::recomputeDependences( - Scop *S, Dependences::AnalysisLevel Level) { - std::unique_ptr<Dependences> D(new Dependences(S->getSharedIslCtx(), Level)); - D->calculateDependences(*S); - auto Inserted = ScopToDepsMap.insert(std::make_pair(S, std::move(D))); - return *Inserted.first->second; +DependenceAnalysis::Result polly::runDependenceAnalysis(Scop &S) { + DependenceAnalysis::Result Result{S, {}}; + return Result; } - -bool DependenceInfoWrapperPass::runOnFunction(Function &F) { - auto &SI = *getAnalysis<ScopInfoWrapperPass>().getSI(); - for (auto &It : SI) { - assert(It.second && "Invalid SCoP object!"); - recomputeDependences(It.second.get(), Dependences::AL_Access); - } - return false; -} - -void DependenceInfoWrapperPass::print(raw_ostream &OS, const Module *M) const { - for (auto &It : ScopToDepsMap) { - assert((It.first && It.second) && "Invalid Scop or Dependence object!\n"); - It.second->print(OS); - } -} - -void DependenceInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScopInfoWrapperPass>(); - AU.setPreservesAll(); -} - -char DependenceInfoWrapperPass::ID = 0; - -Pass *polly::createDependenceInfoWrapperPassPass() { - return new DependenceInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN( - DependenceInfoWrapperPass, "polly-function-dependences", - "Polly - Calculate dependences for all the SCoPs of a function", false, - false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); -INITIALIZE_PASS_END( - DependenceInfoWrapperPass, "polly-function-dependences", - "Polly - Calculate dependences for all the SCoPs of a function", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from DependenceInfoWrapperPass. 
-class DependenceInfoPrinterLegacyFunctionPass final : public FunctionPass { -public: - static char ID; - - DependenceInfoPrinterLegacyFunctionPass() - : DependenceInfoPrinterLegacyFunctionPass(outs()) {} - - explicit DependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - DependenceInfoWrapperPass &P = getAnalysis<DependenceInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DependenceInfoPrinterLegacyFunctionPass::ID = 0; -} // namespace - -Pass *polly::createDependenceInfoPrinterLegacyFunctionPass(raw_ostream &OS) { - return new DependenceInfoPrinterLegacyFunctionPass(OS); -} - -INITIALIZE_PASS_BEGIN( - DependenceInfoPrinterLegacyFunctionPass, "polly-print-function-dependences", - "Polly - Print dependences for all the SCoPs of a function", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfoWrapperPass); -INITIALIZE_PASS_END(DependenceInfoPrinterLegacyFunctionPass, - "polly-print-function-dependences", - "Polly - Print dependences for all the SCoPs of a function", - false, false) diff --git a/polly/lib/Analysis/PruneUnprofitable.cpp b/polly/lib/Analysis/PruneUnprofitable.cpp index f8469c03fe55b..40cc9178da0f3 100644 --- a/polly/lib/Analysis/PruneUnprofitable.cpp +++ b/polly/lib/Analysis/PruneUnprofitable.cpp @@ -55,8 +55,9 @@ static void updateStatistics(Scop &S, bool Pruned) { NumAffineLoops += ScopStats.NumAffineLoops; } } +} // namespace -static bool runPruneUnprofitable(Scop &S) { +bool polly::runPruneUnprofitable(Scop &S) { if (PollyProcessUnprofitable) { POLLY_DEBUG( dbgs() << "NOTE: -polly-process-unprofitable active, won't prune " @@ -79,35 +80,6 @@ static bool runPruneUnprofitable(Scop &S) { return false; } -class PruneUnprofitableWrapperPass final : public ScopPass { -public: - static char ID; - - explicit PruneUnprofitableWrapperPass() : ScopPass(ID) {} - PruneUnprofitableWrapperPass(const PruneUnprofitableWrapperPass &) = delete; - PruneUnprofitableWrapperPass & - operator=(const PruneUnprofitableWrapperPass &) = delete; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScopInfoRegionPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { return runPruneUnprofitable(S); } -}; -} // namespace - -char PruneUnprofitableWrapperPass::ID; - -Pass *polly::createPruneUnprofitableWrapperPass() { - return new PruneUnprofitableWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", - "Polly - Prune unprofitable SCoPs", false, false) -INITIALIZE_PASS_END(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", - "Polly - Prune unprofitable SCoPs", false, false) - llvm::PreservedAnalyses PruneUnprofitablePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U) { diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 67a4c43455809..60a1e00916750 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> +#include <deque> using namespace llvm; using namespace polly; 
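With the legacy wrapper passes removed, the transformations above are reached
through the single entry point runPollyPass(), configured by PollyPassOptions
(see PhaseManager.h earlier in this patch). The following is a minimal sketch
of how a standalone driver might invoke it. The analysis-manager boilerplate
is the standard new-pass-manager setup; the driver function itself is
hypothetical and not part of this patch.

#include "polly/Pass/PhaseManager.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"

// Hypothetical driver: run Polly's phase sequence on a single function.
static bool runPollyOnFunction(llvm::Function &F) {
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Register the analyses (DominatorTree, LoopInfo, RegionInfo,
  // ScalarEvolution, ...) that the phases query through FAM.
  llvm::PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Enable the end-to-end phases plus the default optimizations; let
  // checkConsistency() reject incoherent phase combinations up front.
  polly::PollyPassOptions Opts;
  Opts.enableEnd2End();
  Opts.enableDefaultOpts();
  if (llvm::Error E = Opts.checkConsistency())
    llvm::report_fatal_error(std::move(E));

  // runPollyPass returns whether the function was modified.
  return polly::runPollyPass(F, FAM, Opts);
}

Compared with the removed initialize*Pass()/create*Pass() machinery, this
keeps the phase ordering in one place and lets invalid combinations fail
early through checkConsistency() instead of at an arbitrary point in the
pipeline.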
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 43ed8636b054b..29e89348125f2 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -44,7 +44,6 @@ //===----------------------------------------------------------------------===// #include "polly/ScopDetection.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetectionDiagnostic.h" #include "polly/Support/SCEVValidator.h" @@ -75,8 +74,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" @@ -1983,53 +1980,12 @@ void ScopDetection::verifyAnalysis() { verifyRegion(*R); } -bool ScopDetectionWrapperPass::runOnFunction(Function &F) { - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - Result = std::make_unique<ScopDetection>(DT, SE, LI, RI, AA, ORE); - Result->detect(F); - return false; -} - -void ScopDetectionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - // We also need AA and RegionInfo when we are verifying analysis. - AU.addRequiredTransitive<AAResultsWrapperPass>(); - AU.addRequiredTransitive<RegionInfoPass>(); - AU.setPreservesAll(); -} - -void ScopDetectionWrapperPass::print(raw_ostream &OS, const Module *) const { - for (const Region *R : Result->ValidRegions) - OS << "Valid Region for Scop: " << R->getNameStr() << '\n'; - - OS << "\n"; -} - -ScopDetectionWrapperPass::ScopDetectionWrapperPass() : FunctionPass(ID) { - // Disable runtime alias checks if we ignore aliasing all together. - if (IgnoreAliasing) - PollyUseRuntimeAliasChecks = false; -} - ScopAnalysis::ScopAnalysis() { // Disable runtime alias checks if we ignore aliasing all together. 
if (IgnoreAliasing) PollyUseRuntimeAliasChecks = false; } -void ScopDetectionWrapperPass::releaseMemory() { Result.reset(); } - -char ScopDetectionWrapperPass::ID; - AnalysisKey ScopAnalysis::Key; ScopDetection ScopAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { @@ -2055,66 +2011,3 @@ PreservedAnalyses ScopAnalysisPrinterPass::run(Function &F, OS << "\n"; return PreservedAnalyses::all(); } - -Pass *polly::createScopDetectionWrapperPassPass() { - return new ScopDetectionWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(ScopDetectionWrapperPass, "polly-detect", - "Polly - Detect static control parts (SCoPs)", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); -INITIALIZE_PASS_END(ScopDetectionWrapperPass, "polly-detect", - "Polly - Detect static control parts (SCoPs)", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from ScopDetectionWrapperPass. -class ScopDetectionPrinterLegacyPass final : public FunctionPass { -public: - static char ID; - - ScopDetectionPrinterLegacyPass() : ScopDetectionPrinterLegacyPass(outs()) {} - - explicit ScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - ScopDetectionWrapperPass &P = getAnalysis<ScopDetectionWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<ScopDetectionWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopDetectionPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createScopDetectionPrinterLegacyPass(raw_ostream &OS) { - return new ScopDetectionPrinterLegacyPass(OS); -} - -INITIALIZE_PASS_BEGIN(ScopDetectionPrinterLegacyPass, "polly-print-detect", - "Polly - Print static control parts (SCoPs)", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(ScopDetectionPrinterLegacyPass, "polly-print-detect", - "Polly - Print static control parts (SCoPs)", false, false) diff --git a/polly/lib/Analysis/ScopGraphPrinter.cpp b/polly/lib/Analysis/ScopGraphPrinter.cpp index eb6c995f0bb91..29e212882cefe 100644 --- a/polly/lib/Analysis/ScopGraphPrinter.cpp +++ b/polly/lib/Analysis/ScopGraphPrinter.cpp @@ -14,20 +14,26 @@ //===----------------------------------------------------------------------===// #include "polly/ScopGraphPrinter.h" -#include "polly/LinkAllPasses.h" #include "polly/ScopDetection.h" #include "llvm/Support/CommandLine.h" using namespace polly; using namespace llvm; -static cl::opt<std::string> - ViewFilter("polly-view-only", - cl::desc("Only view functions that match this pattern"), - cl::Hidden, cl::init("")); -static cl::opt<bool> ViewAll("polly-view-all", - cl::desc("Also show functions without any scops"), - cl::Hidden, cl::init(false)); +namespace polly { +std::string ViewFilter; +bool ViewAll; +} // namespace polly + +static cl::opt<std::string, true> + XViewFilter("polly-view-only", + cl::desc("Only view functions that match this pattern"), + cl::location(ViewFilter), cl::Hidden, 
cl::init("")); + +static cl::opt<bool, true> + XViewAll("polly-view-all", + cl::desc("Also show functions without any scops"), + cl::location(ViewAll), cl::Hidden, cl::init(false)); namespace llvm { @@ -134,104 +140,6 @@ void DOTGraphTraits<ScopDetection *>::addCustomGraphFeatures( } // namespace llvm -struct ScopDetectionAnalysisGraphTraits { - static ScopDetection *getGraph(ScopDetectionWrapperPass *Analysis) { - return &Analysis->getSD(); - } -}; - -struct ScopViewerWrapperPass - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopViewerWrapperPass() - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scops", ID) {} - bool processFunction(Function &F, ScopDetectionWrapperPass &SD) override { - if (ViewFilter != "" && !F.getName().count(ViewFilter)) - return false; - - if (ViewAll) - return true; - - // Check that at least one scop was detected. - return std::distance(SD.getSD().begin(), SD.getSD().end()) > 0; - } -}; -char ScopViewerWrapperPass::ID = 0; - -struct ScopOnlyViewerWrapperPass - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopOnlyViewerWrapperPass() - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scopsonly", ID) {} -}; -char ScopOnlyViewerWrapperPass::ID = 0; - -struct ScopPrinterWrapperPass - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopPrinterWrapperPass() - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scops", ID) {} -}; -char ScopPrinterWrapperPass::ID = 0; - -struct ScopOnlyPrinterWrapperPass - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopOnlyPrinterWrapperPass() - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scopsonly", ID) {} -}; -char ScopOnlyPrinterWrapperPass::ID = 0; - -static RegisterPass<ScopViewerWrapperPass> X("view-scops", - "Polly - View Scops of function"); - -static RegisterPass<ScopOnlyViewerWrapperPass> - Y("view-scops-only", - "Polly - View Scops of function (with no function bodies)"); - -static RegisterPass<ScopPrinterWrapperPass> - M("dot-scops", "Polly - Print Scops of function"); - -static RegisterPass<ScopOnlyPrinterWrapperPass> - N("dot-scops-only", - "Polly - Print Scops of function (with no function bodies)"); - -Pass *polly::createDOTViewerWrapperPass() { - return new ScopViewerWrapperPass(); -} - -Pass *polly::createDOTOnlyViewerWrapperPass() { - return new ScopOnlyViewerWrapperPass(); -} - -Pass *polly::createDOTPrinterWrapperPass() { - return new ScopPrinterWrapperPass(); -} - -Pass *polly::createDOTOnlyPrinterWrapperPass() { - return new ScopOnlyPrinterWrapperPass(); -} - bool ScopViewer::processFunction(Function &F, const ScopDetection &SD) { if (ViewFilter != "" && !F.getName().count(ViewFilter)) return false; diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 8c6a2360a249b..70e184d3f897f 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -17,7 +17,6 @@ 
//===----------------------------------------------------------------------===// #include "polly/ScopInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopBuilder.h" #include "polly/ScopDetection.h" @@ -57,7 +56,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -2544,19 +2542,6 @@ raw_ostream &polly::operator<<(raw_ostream &OS, const Scop &scop) { return OS; } -//===----------------------------------------------------------------------===// -void ScopInfoRegionPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequiredTransitive<ScopDetectionWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.setPreservesAll(); -} - void updateLoopCountStatistic(ScopDetection::LoopStats Stats, Scop::ScopStatistics ScopStats) { assert(Stats.NumLoops == ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops); @@ -2592,112 +2577,6 @@ void updateLoopCountStatistic(ScopDetection::LoopStats Stats, NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; } -bool ScopInfoRegionPass::runOnRegion(Region *R, RGPassManager &RGM) { - auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); - - if (!SD.isMaxRegionInScop(*R)) - return false; - - Function *F = R->getEntry()->getParent(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto const &DL = F->getParent()->getDataLayout(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - ScopBuilder SB(R, AC, AA, DL, DT, LI, SD, SE, ORE); - S = SB.getScop(); // take ownership of scop object - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) - if (S) { - ScopDetection::LoopStats Stats = - ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); - updateLoopCountStatistic(Stats, S->getStatistics()); - } -#endif - - return false; -} - -void ScopInfoRegionPass::print(raw_ostream &OS, const Module *) const { - if (S) - S->print(OS, PollyPrintInstructions); - else - OS << "Invalid Scop!\n"; -} - -char ScopInfoRegionPass::ID = 0; - -Pass *polly::createScopInfoRegionPassPass() { return new ScopInfoRegionPass(); } - -INITIALIZE_PASS_BEGIN(ScopInfoRegionPass, "polly-scops", - "Polly - Create polyhedral description of Scops", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_END(ScopInfoRegionPass, "polly-scops", - "Polly - Create polyhedral description of Scops", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { - -/// Print result from 
ScopInfoRegionPass. -class ScopInfoPrinterLegacyRegionPass final : public RegionPass { -public: - static char ID; - - ScopInfoPrinterLegacyRegionPass() : ScopInfoPrinterLegacyRegionPass(outs()) {} - - explicit ScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS) - : RegionPass(ID), OS(OS) {} - - bool runOnRegion(Region *R, RGPassManager &RGM) override { - ScopInfoRegionPass &P = getAnalysis<ScopInfoRegionPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << R->getNameStr() << "' in function '" - << R->getEntry()->getParent()->getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - RegionPass::getAnalysisUsage(AU); - AU.addRequired<ScopInfoRegionPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopInfoPrinterLegacyRegionPass::ID = 0; -} // namespace - -Pass *polly::createScopInfoPrinterLegacyRegionPass(raw_ostream &OS) { - return new ScopInfoPrinterLegacyRegionPass(OS); -} - -INITIALIZE_PASS_BEGIN(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", - "Polly - Print polyhedral description of Scops", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_END(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", - "Polly - Print polyhedral description of Scops", false, - false) - -//===----------------------------------------------------------------------===// - ScopInfo::ScopInfo(const DataLayout &DL, ScopDetection &SD, ScalarEvolution &SE, LoopInfo &LI, AliasAnalysis &AA, DominatorTree &DT, AssumptionCache &AC, OptimizationRemarkEmitter &ORE) @@ -2771,110 +2650,3 @@ PreservedAnalyses ScopInfoPrinterPass::run(Function &F, } return PreservedAnalyses::all(); } - -void ScopInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequiredTransitive<ScopDetectionWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.setPreservesAll(); -} - -bool ScopInfoWrapperPass::runOnFunction(Function &F) { - auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto const &DL = F.getParent()->getDataLayout(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - Result.reset(new ScopInfo{DL, SD, SE, LI, AA, DT, AC, ORE}); - return false; -} - -void ScopInfoWrapperPass::print(raw_ostream &OS, const Module *) const { - for (auto &It : *Result) { - if (It.second) - It.second->print(OS, PollyPrintInstructions); - else - OS << "Invalid Scop!\n"; - } -} - -char ScopInfoWrapperPass::ID = 0; - -Pass *polly::createScopInfoWrapperPassPass() { - return new ScopInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN( - ScopInfoWrapperPass, "polly-function-scops", - "Polly - Create polyhedral description of all Scops of a function", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_END( - ScopInfoWrapperPass, "polly-function-scops", - "Polly - Create polyhedral description of all Scops of a function", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from ScopInfoWrapperPass. -class ScopInfoPrinterLegacyFunctionPass final : public FunctionPass { -public: - static char ID; - - ScopInfoPrinterLegacyFunctionPass() - : ScopInfoPrinterLegacyFunctionPass(outs()) {} - explicit ScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - ScopInfoWrapperPass &P = getAnalysis<ScopInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<ScopInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopInfoPrinterLegacyFunctionPass::ID = 0; -} // namespace - -Pass *polly::createScopInfoPrinterLegacyFunctionPass(raw_ostream &OS) { - return new ScopInfoPrinterLegacyFunctionPass(OS); -} - -INITIALIZE_PASS_BEGIN( - ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", - "Polly - Print polyhedral description of all Scops of a function", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); -INITIALIZE_PASS_END( - ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", - "Polly - Print polyhedral description of all Scops of a function", false, - false) diff --git a/polly/lib/Analysis/ScopPass.cpp b/polly/lib/Analysis/ScopPass.cpp index 719cd0f6984e0..61417e799cfa5 100644 --- a/polly/lib/Analysis/ScopPass.cpp +++ b/polly/lib/Analysis/ScopPass.cpp @@ -24,42 +24,6 @@ using namespace llvm; using namespace polly; -bool ScopPass::runOnRegion(Region *R, RGPassManager &RGM) { - S = nullptr; - - if (skipRegion(*R)) - return false; - - if ((S = getAnalysis<ScopInfoRegionPass>().getScop())) - return runOnScop(*S); - - return false; -} - -void ScopPass::print(raw_ostream &OS, const Module *M) const { - if (S) - printScop(OS, *S); -} - -void ScopPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ScopInfoRegionPass>(); - - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScopDetectionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); - AU.addPreserved<LazyBlockFrequencyInfoPass>(); - AU.addPreserved<LazyBranchProbabilityInfoPass>(); - AU.addPreserved<RegionInfoPass>(); - AU.addPreserved<ScopInfoRegionPass>(); - AU.addPreserved<TargetTransformInfoWrapperPass>(); -} - namespace polly { template class OwningInnerAnalysisManagerProxy<ScopAnalysisManager, Function>; } diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index 0ed673815ff34..e4f196f151c9e 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -60,6 +60,9 @@ add_llvm_pass_plugin(Polly CodeGen/RuntimeDebugBuilder.cpp 
CodeGen/PerfMonitor.cpp Exchange/JSONExporter.cpp + Pass/PhaseManager.cpp + Pass/PollyFunctionPass.cpp + Pass/PollyModulePass.cpp Support/GICHelper.cpp Support/PollyDebug.cpp Support/SCEVAffinator.cpp diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp index 2d8b393cc039c..f2d5a3422849e 100644 --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -25,7 +25,6 @@ #include "polly/CodeGen/PerfMonitor.h" #include "polly/CodeGen/Utils.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/ScopHelper.h" @@ -37,7 +36,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -314,59 +312,6 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI, return true; } -namespace { - -class CodeGeneration final : public ScopPass { -public: - static char ID; - - /// The data layout used. - const DataLayout *DL; - - /// @name The analysis passes we need to generate code. - /// - ///{ - LoopInfo *LI; - IslAstInfo *AI; - DominatorTree *DT; - ScalarEvolution *SE; - RegionInfo *RI; - ///} - - CodeGeneration() : ScopPass(ID) {} - - /// Generate LLVM-IR for the SCoP @p S. - bool runOnScop(Scop &S) override { - AI = &getAnalysis<IslAstInfoWrapperPass>().getAI(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - DL = &S.getFunction().getDataLayout(); - RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); - return generateCode(S, *AI, *LI, *DT, *SE, *RI); - } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<IslAstInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<ScopDetectionWrapperPass>(); - AU.addRequired<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - - AU.addPreserved<DependenceInfo>(); - AU.addPreserved<IslAstInfoWrapperPass>(); - - // FIXME: We do not yet add regions for the newly generated code to the - // region tree. 
- } -}; -} // namespace - PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &AR, SPMUpdater &U) { @@ -379,17 +324,6 @@ PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -char CodeGeneration::ID = 1; - -Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } - -INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", - "Polly - Create LLVM-IR from SCoPs", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", - "Polly - Create LLVM-IR from SCoPs", false, false) +bool polly::runCodeGeneration(Scop &S, RegionInfo &RI, IslAstInfo &AI) { + return generateCode(S, AI, *S.getLI(), *S.getDT(), *S.getSE(), RI); +} diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp index 09bacda196742..3177cda225f1d 100644 --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -29,7 +29,6 @@ #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetection.h" #include "polly/ScopInfo.h" @@ -83,6 +82,11 @@ static cl::opt<bool> DetectParallel("polly-ast-detect-parallel", cl::desc("Detect parallelism"), cl::Hidden, cl::cat(PollyCategory)); +static cl::opt<bool> + PollyPrintAst("polly-print-ast", + cl::desc("Print the ISL abstract syntax tree"), + cl::cat(PollyCategory)); + STATISTIC(ScopsProcessed, "Number of SCoPs processed"); STATISTIC(ScopsBeneficial, "Number of beneficial SCoPs"); STATISTIC(BeneficialAffineLoops, "Number of beneficial affine loops"); @@ -776,90 +780,19 @@ PreservedAnalyses IslAstPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -void IslAstInfoWrapperPass::releaseMemory() { Ast.reset(); } - -bool IslAstInfoWrapperPass::runOnScop(Scop &Scop) { - auto GetDeps = [this](Dependences::AnalysisLevel Lvl) -> const Dependences & { - return getAnalysis<DependenceInfo>().getDependences(Lvl); +std::unique_ptr<IslAstInfo> +polly::runIslAstGen(Scop &S, DependenceAnalysis::Result &DA) { + auto GetDeps = [&](Dependences::AnalysisLevel Lvl) -> const Dependences & { + return DA.getDependences(Lvl); }; - Ast = runIslAst(Scop, GetDeps); - - return false; -} - -void IslAstInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - // Get the Common analysis usage of ScopPasses. 
- ScopPass::getAnalysisUsage(AU); - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<DependenceInfo>(); - - AU.addPreserved<DependenceInfo>(); -} - -void IslAstInfoWrapperPass::printScop(raw_ostream &OS, Scop &S) const { - OS << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" - << S.getName() << "' in function '" << S.getFunction().getName() << "':\n"; - if (Ast) - Ast->print(OS); -} - -char IslAstInfoWrapperPass::ID = 0; - -Pass *polly::createIslAstInfoWrapperPassPass() { - return new IslAstInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(IslAstInfoWrapperPass, "polly-ast", - "Polly - Generate an AST of the SCoP (isl)", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_END(IslAstInfoWrapperPass, "polly-ast", - "Polly - Generate an AST from the SCoP (isl)", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from IslAstInfoWrapperPass. -class IslAstInfoPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - IslAstInfoPrinterLegacyPass() : IslAstInfoPrinterLegacyPass(outs()) {} - explicit IslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - IslAstInfoWrapperPass &P = getAnalysis<IslAstInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; + std::unique_ptr<IslAstInfo> Result = runIslAst(S, GetDeps); + if (PollyPrintAst) { + outs() << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Result) + Result->print(llvm::outs()); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<IslAstInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char IslAstInfoPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createIslAstInfoPrinterLegacyPass(raw_ostream &OS) { - return new IslAstInfoPrinterLegacyPass(OS); + return Result; } - -INITIALIZE_PASS_BEGIN(IslAstInfoPrinterLegacyPass, "polly-print-ast", - "Polly - Print the AST from a SCoP (isl)", false, false); -INITIALIZE_PASS_DEPENDENCY(IslAstInfoWrapperPass); -INITIALIZE_PASS_END(IslAstInfoPrinterLegacyPass, "polly-print-ast", - "Polly - Print the AST from a SCoP (isl)", false, false) diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index dfd63146edb5e..7d30c030aa6e1 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -12,7 +12,6 @@ #include "polly/JSONExporter.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -36,6 +35,11 @@ using namespace polly; #define DEBUG_TYPE "polly-import-jscop" +static cl::opt<bool> + PollyPrintImportJscop("polly-print-import-jscop", + cl::desc("Polly - Print Scop import result"), + cl::cat(PollyCategory)); + STATISTIC(NewAccessMapFound, "Number of updated access functions"); namespace { @@ -50,36 +54,6 @@ static cl::opt<std::string> cl::desc("Postfix to append to the import .jsop files."), cl::Hidden, cl::value_desc("File postfix"), cl::ValueRequired, cl::init(""), cl::cat(PollyCategory)); - -class 
JSONExporter : public ScopPass { -public: - static char ID; - explicit JSONExporter() : ScopPass(ID) {} - - /// Export the SCoP @p S to a JSON file. - bool runOnScop(Scop &S) override; - - /// Print the SCoP @p S as it is exported. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -class JSONImporter : public ScopPass { -public: - static char ID; - std::vector<std::string> NewAccessStrings; - explicit JSONImporter() : ScopPass(ID) {} - /// Import new access functions for SCoP @p S from a JSON file. - bool runOnScop(Scop &S) override; - - /// Print the SCoP @p S and the imported access functions. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; } // namespace static std::string getFileName(Scop &S, StringRef Suffix = "") { @@ -742,21 +716,6 @@ static bool importScop(Scop &S, const Dependences &D, const DataLayout &DL, return true; } -char JSONExporter::ID = 0; -void JSONExporter::printScop(raw_ostream &OS, Scop &S) const { OS << S; } - -bool JSONExporter::runOnScop(Scop &S) { - exportScop(S); - return false; -} - -void JSONExporter::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<ScopInfoRegionPass>(); -} - -Pass *polly::createJSONExporterPass() { return new JSONExporter(); } - PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -764,37 +723,6 @@ PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -char JSONImporter::ID = 0; - -void JSONImporter::printScop(raw_ostream &OS, Scop &S) const { - OS << S; - for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), - E = NewAccessStrings.end(); - I != E; I++) - OS << "New access function '" << *I << "' detected in JSCOP file\n"; -} - -bool JSONImporter::runOnScop(Scop &S) { - const Dependences &D = - getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement); - const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); - - if (!importScop(S, D, DL, &NewAccessStrings)) - report_fatal_error("Tried to import a malformed jscop file."); - - return false; -} - -void JSONImporter::getAnalysisUsage(AnalysisUsage &AU) const { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - - // TODO: JSONImporter should throw away DependenceInfo. 
- AU.addPreserved<DependenceInfo>(); -} - -Pass *polly::createJSONImporterPass() { return new JSONImporter(); } - PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -814,68 +742,24 @@ PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, return PA; } -INITIALIZE_PASS_BEGIN(JSONExporter, "polly-export-jscop", - "Polly - Export Scops as JSON" - " (Writes a .jscop file for each Scop)", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo) -INITIALIZE_PASS_END(JSONExporter, "polly-export-jscop", - "Polly - Export Scops as JSON" - " (Writes a .jscop file for each Scop)", - false, false) - -INITIALIZE_PASS_BEGIN(JSONImporter, "polly-import-jscop", - "Polly - Import Scops from JSON" - " (Reads a .jscop file for each Scop)", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo) -INITIALIZE_PASS_END(JSONImporter, "polly-import-jscop", - "Polly - Import Scops from JSON" - " (Reads a .jscop file for each Scop)", - false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from JSONImporter. -class JSONImporterPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()) {} - explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - JSONImporter &P = getAnalysis<JSONImporter>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } +void polly::runImportJSON(Scop &S, DependenceAnalysis::Result &DA) { + const Dependences &D = DA.getDependences(Dependences::AL_Statement); + const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); + std::vector<std::string> NewAccessStrings; + if (!importScop(S, D, DL, &NewAccessStrings)) + report_fatal_error("Tried to import a malformed jscop file."); - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<JSONImporter>(); - AU.setPreservesAll(); + if (PollyPrintImportJscop) { + outs() + << "Printing analysis 'Polly - Print Scop import result' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + outs() << S; + for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), + E = NewAccessStrings.end(); + I != E; I++) + outs() << "New access function '" << *I << "' detected in JSCOP file\n"; } - -private: - llvm::raw_ostream &OS; -}; - -char JSONImporterPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) { - return new JSONImporterPrinterLegacyPass(OS); } -INITIALIZE_PASS_BEGIN(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", - "Polly - Print Scop import result", false, false) -INITIALIZE_PASS_DEPENDENCY(JSONImporter) -INITIALIZE_PASS_END(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", - "Polly - Print Scop import result", false, false) +void polly::runExportJSON(Scop &S) { exportScop(S); } diff --git a/polly/lib/Pass/PhaseManager.cpp b/polly/lib/Pass/PhaseManager.cpp new file mode 100644 index 0000000000000..fb76c811859b8 --- /dev/null +++ b/polly/lib/Pass/PhaseManager.cpp @@ -0,0 +1,424 @@ +//===------ PhaseManager.cpp ------------------------------------*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PhaseManager.h"
+#include "polly/CodeGen/CodeGeneration.h"
+#include "polly/CodeGen/IslAst.h"
+#include "polly/CodePreparation.h"
+#include "polly/DeLICM.h"
+#include "polly/DeadCodeElimination.h"
+#include "polly/DependenceInfo.h"
+#include "polly/FlattenSchedule.h"
+#include "polly/ForwardOpTree.h"
+#include "polly/JSONExporter.h"
+#include "polly/MaximalStaticExpansion.h"
+#include "polly/PruneUnprofitable.h"
+#include "polly/ScheduleOptimizer.h"
+#include "polly/ScopDetection.h"
+#include "polly/ScopDetectionDiagnostic.h"
+#include "polly/ScopGraphPrinter.h"
+#include "polly/ScopInfo.h"
+#include "polly/Simplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "polly-pass"
+
+using namespace polly;
+using namespace llvm;
+
+namespace {
+
+/// Recurse through all subregions and add them, together with R itself, to RQ.
+static void addRegionIntoQueue(Region &R, SmallVector<Region *> &RQ) {
+  RQ.push_back(&R);
+  for (const auto &E : R)
+    addRegionIntoQueue(*E, RQ);
+}
+
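[Illustration, not part of the patch: the intended entry point into this phase
pipeline is runPollyPass() at the bottom of this file. A minimal sketch of how
a new-PM function pass would drive it, using only interfaces added in this
patch ("MyPollyPass" is a hypothetical caller):

    PreservedAnalyses MyPollyPass::run(Function &F, FunctionAnalysisManager &FAM) {
      PollyPassOptions Opts;
      Opts.enableEnd2End();     // detect, scops, deps, ast, codegen
      Opts.enableDefaultOpts(); // prepare, simplify, optree, delicm, prune, opt-isl
      bool Modified = polly::runPollyPass(F, FAM, std::move(Opts));
      return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
    }

This mirrors what PollyFunctionPass::run() does in PollyFunctionPass.cpp later
in this patch.]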
+/// The phase pipeline of Polly to be embedded into another pass manager that
+/// runs passes on functions.
+///
+/// Polly holds state besides LLVM-IR (RegionInfo and ScopInfo) between phases
+/// that LLVM pass managers do not consider when scheduling analyses and passes.
+/// That is, the ScopInfo must persist between phases, and a pass manager must
+/// not invalidate it expecting to recompute it later.
+class PhaseManager {
+private:
+  Function &F;
+  FunctionAnalysisManager &FAM;
+  PollyPassOptions Opts;
+
+public:
+  PhaseManager(Function &F, FunctionAnalysisManager &FAM, PollyPassOptions Opts)
+      : F(F), FAM(FAM), Opts(std::move(Opts)) {}
+
+  /// Execute Polly's phases as indicated by the options.
+  bool run() {
+    // Get analyses from the function pass manager.
+    // These must be preserved during all phases so that if processing one SCoP
+    // has finished, the next SCoP can still use them. Recomputing is not an
+    // option because ScopDetection stores references to the old results.
+    // TODO: CodePreparation doesn't actually need these analyses, it just
+    // keeps them up-to-date. If they are not computed yet, they can also be
+    // computed after the prepare phase.
+    LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+    DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+    bool ModifiedIR = false;
+
+    // Phase: prepare
+    // TODO: Setting ModifiedIR will invalidate any analysis, even if DT, LI
+    // are preserved.
+    if (Opts.isPhaseEnabled(PassPhase::Prepare)) {
+      PreservedAnalyses PA = CodePreparationPass().run(F, FAM);
+      FAM.invalidate(F, PA);
+      if (!PA.areAllPreserved())
+        ModifiedIR = true;
+    }
+
+    // Can't do anything without detection.
+    if (!Opts.isPhaseEnabled(PassPhase::Detection))
+      return false;
+
+    AAResults &AA = FAM.getResult<AAManager>(F);
+    ScalarEvolution &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+    OptimizationRemarkEmitter &ORE =
+        FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+    // ScopDetection modifies RegionInfo; do not cache it, nor use a cached
+    // version.
+    RegionInfo RI = RegionInfoAnalysis().run(F, FAM);
+
+    // Phase: detection
+    ScopDetection SD(DT, SE, LI, RI, AA, ORE);
+    SD.detect(F);
+    if (Opts.isPhaseEnabled(PassPhase::PrintDetect)) {
+      outs() << "Detected Scops in Function " << F.getName() << "\n";
+      for (const Region *R : SD.ValidRegions)
+        outs() << "Valid Region for Scop: " << R->getNameStr() << '\n';
+      outs() << "\n";
+    }
+
+    if (Opts.isPhaseEnabled(PassPhase::DotScops))
+      printGraphForFunction(F, &SD, "scops", false);
+    if (Opts.isPhaseEnabled(PassPhase::DotScopsOnly))
+      printGraphForFunction(F, &SD, "scopsonly", true);
+
+    auto ViewScops = [&](const char *Name, bool IsSimply) {
+      if (!Opts.ViewFilter.empty() && !F.getName().count(Opts.ViewFilter))
+        return;
+
+      if (Opts.ViewAll || std::distance(SD.begin(), SD.end()) > 0)
+        viewGraphForFunction(F, &SD, Name, IsSimply);
+    };
+    if (Opts.isPhaseEnabled(PassPhase::ViewScops))
+      ViewScops("scops", false);
+    if (Opts.isPhaseEnabled(PassPhase::ViewScopsOnly))
+      ViewScops("scopsonly", true);
+
+    // Phase: scops
+    AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    ScopInfo Info(DL, SD, SE, LI, AA, DT, AC, ORE);
+    if (Opts.isPhaseEnabled(PassPhase::PrintScopInfo)) {
+      if (Region *TLR = RI.getTopLevelRegion()) {
+        SmallVector<Region *> Regions;
+        addRegionIntoQueue(*TLR, Regions);
+
+        // Reverse iteration because the regression tests expect it.
+        for (Region *R : reverse(Regions)) {
+          Scop *S = Info.getScop(R);
+          outs() << "Printing analysis 'Polly - Create polyhedral "
+                    "description of Scops' for region: '"
+                 << R->getNameStr() << "' in function '" << F.getName()
+                 << "':\n";
+          if (S)
+            outs() << *S;
+          else
+            outs() << "Invalid Scop!\n";
+        }
+      }
+    }
+
+    SmallPriorityWorklist<Region *, 4> Worklist;
+    for (auto &[R, S] : Info)
+      if (S)
+        Worklist.insert(R);
+
+    TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+    while (!Worklist.empty()) {
+      Region *R = Worklist.pop_back_val();
+      if (!SD.isMaxRegionInScop(*R, /*Verify=*/false))
+        continue;
+      Scop *S = Info.getScop(R);
+
+      // Phase: flatten
+      if (Opts.isPhaseEnabled(PassPhase::Flatten))
+        runFlattenSchedulePass(*S);
+
+      // Phase: deps
+      // The actual analysis runs on-demand, so it does not matter whether the
+      // phase is actually enabled, but use this location to print dependences.
+      DependenceAnalysis::Result DA = runDependenceAnalysis(*S);
+      if (Opts.isPhaseEnabled(PassPhase::PrintDependences)) {
+        assert(Opts.isPhaseEnabled(PassPhase::Dependences));
+        const Dependences &D = DA.getDependences(Opts.PrintDepsAnalysisLevel);
+        D.print(outs());
+      }
+
+      // Phase: import-jscop
+      if (Opts.isPhaseEnabled(PassPhase::ImportJScop))
+        runImportJSON(*S, DA);
+
+      // Phase: simplify-0
+      bool ModifiedSinceSimplify = true;
+      if (Opts.isPhaseEnabled(PassPhase::Simplify0)) {
+        runSimplify(*S, 0);
+        ModifiedSinceSimplify = false;
+      }
+
+      // Phase: optree
+      if (Opts.isPhaseEnabled(PassPhase::Optree)) {
+        bool ModifiedByOptree = runForwardOpTree(*S);
+        ModifiedSinceSimplify |= ModifiedByOptree;
+      }
+
+      // Phase: delicm
+      if (Opts.isPhaseEnabled(PassPhase::DeLICM)) {
+        bool ModifiedByDelicm = runDeLICM(*S);
+        ModifiedSinceSimplify |= ModifiedByDelicm;
+      }
+
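[Illustration, not part of the patch: the DA handle threaded through these
phases computes its dependences on demand and per analysis level, so
consecutive phases can ask for different granularities. A sketch against the
Dependences interface as Polly exposes it (AL_Access is assumed to be
available, as in DependenceInfo.h):

    // Statement-level dependences, as consumed by e.g. the schedule optimizer:
    const Dependences &DStmt = DA.getDependences(Dependences::AL_Statement);
    if (DStmt.hasValidDependences()) {
      // ... inspect or test the dependences ...
    }
    // A finer, access-level view can be requested independently:
    const Dependences &DAcc = DA.getDependences(Dependences::AL_Access);
]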
+      // Phase: simplify-1
+      // If we have already run simplify-0, do not re-run it if the SCoP has
+      // not changed since then.
+      if (ModifiedSinceSimplify && Opts.isPhaseEnabled(PassPhase::Simplify1)) {
+        runSimplify(*S, 1);
+        ModifiedSinceSimplify = false;
+      }
+
+      // Phase: dce
+      if (Opts.isPhaseEnabled(PassPhase::DeadCodeElimination))
+        runDeadCodeElim(*S, DA);
+
+      // Phase: mse
+      if (Opts.isPhaseEnabled(PassPhase::MaximumStaticExtension))
+        runMaximalStaticExpansion(*S, DA);
+
+      // Phase: prune
+      if (Opts.isPhaseEnabled(PassPhase::PruneUnprofitable))
+        runPruneUnprofitable(*S);
+
+      // Phase: opt-isl
+      if (Opts.isPhaseEnabled(PassPhase::Optimization))
+        runIslScheduleOptimizer(*S, &TTI, DA);
+
+      // Phase: export-jscop
+      if (Opts.isPhaseEnabled(PassPhase::ExportJScop))
+        runExportJSON(*S);
+
+      // Phase: ast
+      // Cannot run codegen unless ast is enabled.
+      if (!Opts.isPhaseEnabled(PassPhase::AstGen))
+        continue;
+      std::unique_ptr<IslAstInfo> IslAst = runIslAstGen(*S, DA);
+
+      // Phase: codegen
+      if (!Opts.isPhaseEnabled(PassPhase::CodeGen))
+        continue;
+      bool ModifiedByCodeGen = runCodeGeneration(*S, RI, *IslAst);
+      if (ModifiedByCodeGen) {
+        ModifiedIR = true;
+
+        // For all regions, create new polly::Scop objects because the old
+        // ones refer to invalidated LLVM-IR.
+        // FIXME: Adds all SCoPs again to statistics.
+        Info.recompute();
+      }
+    }
+
+    return ModifiedIR;
+  }
+};
+} // namespace
+
+StringRef polly::getPhaseName(PassPhase Phase) {
+  switch (Phase) {
+  case PassPhase::Prepare:
+    return "prepare";
+  case PassPhase::Detection:
+    return "detect";
+  case PassPhase::PrintDetect:
+    return "print-detect";
+  case PassPhase::DotScops:
+    return "dot-scops";
+  case PassPhase::DotScopsOnly:
+    return "dot-scops-only";
+  case PassPhase::ViewScops:
+    return "view-scops";
+  case PassPhase::ViewScopsOnly:
+    return "view-scops-only";
+  case PassPhase::ScopInfo:
+    return "scops";
+  case PassPhase::PrintScopInfo:
+    return "print-scops";
+  case PassPhase::Flatten:
+    return "flatten";
+  case PassPhase::Dependences:
+    return "deps";
+  case PassPhase::PrintDependences:
+    return "print-deps";
+  case PassPhase::ImportJScop:
+    return "import-jscop";
+  case PassPhase::Simplify0:
+    return "simplify-0";
+  case PassPhase::Optree:
+    return "optree";
+  case PassPhase::DeLICM:
+    return "delicm";
+  case PassPhase::Simplify1:
+    return "simplify-1";
+  case PassPhase::DeadCodeElimination:
+    return "dce";
+  case PassPhase::MaximumStaticExtension:
+    return "mse";
+  case PassPhase::PruneUnprofitable:
+    return "prune";
+  case PassPhase::Optimization:
+    return "opt-isl"; // "opt" would conflict with the llvm executable
+  case PassPhase::ExportJScop:
+    return "export-jscop";
+  case PassPhase::AstGen:
+    return "ast";
+  case PassPhase::CodeGen:
+    return "codegen";
+  default:
+    llvm_unreachable("Unexpected phase");
+  }
+}
+
+PassPhase polly::parsePhase(StringRef Name) {
+  return StringSwitch<PassPhase>(Name)
+      .Case("prepare", PassPhase::Prepare)
+      .Case("detect", PassPhase::Detection)
+      .Case("print-detect", PassPhase::PrintDetect)
+      .Case("dot-scops", PassPhase::DotScops)
+      .Case("dot-scops-only", PassPhase::DotScopsOnly)
+      .Case("view-scops", PassPhase::ViewScops)
+      .Case("view-scops-only", PassPhase::ViewScopsOnly)
+      .Case("scops", PassPhase::ScopInfo)
+      .Case("print-scops", PassPhase::PrintScopInfo)
+      .Case("flatten", PassPhase::Flatten)
+      .Case("deps", PassPhase::Dependences)
+      .Case("print-deps", PassPhase::PrintDependences)
+      .Case("import-jscop", PassPhase::ImportJScop)
+      .Case("simplify-0", PassPhase::Simplify0)
+      .Case("optree", PassPhase::Optree)
+      .Case("delicm", PassPhase::DeLICM)
+      .Case("simplify-1", PassPhase::Simplify1)
+
.Case("dce", PassPhase::DeadCodeElimination) + .Case("mse", PassPhase::MaximumStaticExtension) + .Case("prune", PassPhase::PruneUnprofitable) + .Case("opt-isl", PassPhase::Optimization) + .Case("export-jscop", PassPhase::ExportJScop) + .Case("ast", PassPhase::AstGen) + .Case("codegen", PassPhase::CodeGen) + .Default(PassPhase::None); +} + +bool polly::dependsOnDependenceInfo(PassPhase Phase) { + // Nothing before dep phase can depend on it + if (static_cast<size_t>(Phase) <= static_cast<size_t>(PassPhase::Dependences)) + return false; + + switch (Phase) { + case PassPhase::Simplify0: + case PassPhase::Optree: + case PassPhase::DeLICM: + case PassPhase::Simplify1: + case PassPhase::PruneUnprofitable: + case PassPhase::ImportJScop: + case PassPhase::ExportJScop: + case PassPhase::AstGen: // transitively through codegen + case PassPhase::CodeGen: + return false; + default: + return true; + } +} + +void PollyPassOptions::enableEnd2End() { + setPhaseEnabled(PassPhase::Detection); + setPhaseEnabled(PassPhase::ScopInfo); + setPhaseEnabled(PassPhase::Dependences); + setPhaseEnabled(PassPhase::AstGen); + setPhaseEnabled(PassPhase::CodeGen); +} + +void PollyPassOptions::enableDefaultOpts() { + setPhaseEnabled(PassPhase::Prepare); + setPhaseEnabled(PassPhase::Simplify0); + setPhaseEnabled(PassPhase::Optree); + setPhaseEnabled(PassPhase::DeLICM); + setPhaseEnabled(PassPhase::Simplify1); + setPhaseEnabled(PassPhase::PruneUnprofitable); + setPhaseEnabled(PassPhase::Optimization); +} + +void PollyPassOptions::disableAfter(PassPhase Phase) { + assert(Phase != PassPhase::None); + for (PassPhase P : enum_seq_inclusive(Phase, PassPhase::PassPhaseLast)) { + if (P == Phase) + continue; + setPhaseEnabled(P, false); + } +} + +Error PollyPassOptions::checkConsistency() const { + for (PassPhase P : enum_seq_inclusive(PassPhase::PassPhaseFirst, + PassPhase::PassPhaseLast)) { + if (!isPhaseEnabled(P)) + continue; + + // Prepare and Detection have no requirements + if (P == PassPhase::Prepare || P == PassPhase::Detection) + continue; + + if (!isPhaseEnabled(PassPhase::Detection)) + return make_error<StringError>( + formatv("'{0}' requires 'detect' to be enabled", getPhaseName(P)) + .str(), + inconvertibleErrorCode()); + + if (static_cast<size_t>(P) < static_cast<size_t>(PassPhase::ScopInfo)) + continue; + + if (!isPhaseEnabled(PassPhase::ScopInfo)) + return make_error<StringError>( + formatv("'{0}' requires 'scops' to be enabled", getPhaseName(P)) + .str(), + inconvertibleErrorCode()); + + if (dependsOnDependenceInfo(P) && !isPhaseEnabled(PassPhase::Dependences)) + return make_error<StringError>( + formatv("'{0}' requires 'deps' to be enabled", getPhaseName(P)).str(), + inconvertibleErrorCode()); + } + + if (isPhaseEnabled(PassPhase::CodeGen) && !isPhaseEnabled(PassPhase::AstGen)) + return make_error<StringError>("'codegen' requires 'ast' to be enabled", + inconvertibleErrorCode()); + + return Error::success(); +} + +bool polly::runPollyPass(Function &F, FunctionAnalysisManager &FAM, + PollyPassOptions Opts) { + return PhaseManager(F, FAM, std::move(Opts)).run(); +} diff --git a/polly/lib/Pass/PollyFunctionPass.cpp b/polly/lib/Pass/PollyFunctionPass.cpp new file mode 100644 index 0000000000000..a478e4df2ca20 --- /dev/null +++ b/polly/lib/Pass/PollyFunctionPass.cpp @@ -0,0 +1,22 @@ +//===------ PollyFunctionPass.cpp - Polly function pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PollyFunctionPass.h"
+
+using namespace llvm;
+using namespace polly;
+
+PreservedAnalyses PollyFunctionPass::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &FAM) {
+  bool ModifiedIR = runPollyPass(F, FAM, Opts);
+
+  // Be conservative about preserved analyses.
+  // FIXME: May also need to invalidate/update Module/CGSCC analyses, but
+  // cannot reach them within a FunctionPassManager.
+  return ModifiedIR ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
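[Illustration, not part of the patch: PollyFunctionPass is the unit that
PollyPasses.def registers for the "polly" pipeline name, wrapped in a
module-to-function adaptor; the PollyModulePass defined next is a hand-rolled
equivalent of that adaptor. Sketch of the two formulations, assuming
PollyModulePass, like PollyFunctionPass, is constructed from a PollyPassOptions
(its header is not part of this hunk):

    llvm::ModulePassManager MPM;
    // Adaptor form, as used for the "polly" entry in PollyPasses.def:
    MPM.addPass(createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)));
    // Hand-rolled form from PollyModulePass.cpp:
    MPM.addPass(PollyModulePass(Opts));
]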
diff --git a/polly/lib/Pass/PollyModulePass.cpp b/polly/lib/Pass/PollyModulePass.cpp
new file mode 100644
index 0000000000000..f56ee672b76af
--- /dev/null
+++ b/polly/lib/Pass/PollyModulePass.cpp
@@ -0,0 +1,29 @@
+//===------ PollyModulePass.cpp - Polly module pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PollyModulePass.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+using namespace polly;
+
+PreservedAnalyses PollyModulePass::run(llvm::Module &M,
+                                       llvm::ModuleAnalysisManager &MAM) {
+  FunctionAnalysisManager &FAM =
+      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  bool ModifiedAnyIR = false;
+  for (Function &F : M) {
+    bool LocalModifiedIR = runPollyPass(F, FAM, Opts);
+    ModifiedAnyIR |= LocalModifiedIR;
+  }
+
+  // Be conservative about preserved analyses, especially if parallel functions
+  // have been outlined.
+  return ModifiedAnyIR ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/polly/lib/Support/DumpFunctionPass.cpp b/polly/lib/Support/DumpFunctionPass.cpp
index e47b7fe0db966..9565e2156aee6 100644
--- a/polly/lib/Support/DumpFunctionPass.cpp
+++ b/polly/lib/Support/DumpFunctionPass.cpp
@@ -13,7 +13,6 @@
 #include "polly/Support/DumpFunctionPass.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassInstrumentation.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -82,50 +81,10 @@ static void runDumpFunction(llvm::Function &F, StringRef Suffix) {
   Out->keep();
   LLVM_DEBUG(dbgs() << "Dump file " << Dumpfile << " written successfully\n");
 }
-
-class DumpFunctionWrapperPass final : public FunctionPass {
-private:
-  DumpFunctionWrapperPass(const DumpFunctionWrapperPass &) = delete;
-  const DumpFunctionWrapperPass &
-  operator=(const DumpFunctionWrapperPass &) = delete;
-
-  std::string Suffix;
-
-public:
-  static char ID;
-
-  explicit DumpFunctionWrapperPass() : FunctionPass(ID), Suffix("-dump") {}
-
-  explicit DumpFunctionWrapperPass(std::string Suffix)
-      : FunctionPass(ID), Suffix(std::move(Suffix)) {}
-
-  /// @name FunctionPass interface
-  //@{
-  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-
-  bool runOnFunction(llvm::Function &F) override {
-    runDumpFunction(F, Suffix);
-    return false;
-  }
-  //@}
-};
-
-char DumpFunctionWrapperPass::ID;
 } // namespace
 
-FunctionPass *polly::createDumpFunctionWrapperPass(std::string Suffix) {
-  return new DumpFunctionWrapperPass(std::move(Suffix));
-}
-
 llvm::PreservedAnalyses DumpFunctionPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
   runDumpFunction(F, Suffix);
   return PreservedAnalyses::all();
 }
-
-INITIALIZE_PASS_BEGIN(DumpFunctionWrapperPass, "polly-dump-function",
-                      "Polly - Dump Function", false, false)
-INITIALIZE_PASS_END(DumpFunctionWrapperPass, "polly-dump-function",
-                    "Polly - Dump Function", false, false)
diff --git a/polly/lib/Support/DumpModulePass.cpp b/polly/lib/Support/DumpModulePass.cpp
index c1c27ef6ac757..2eaa0707fe571 100644
--- a/polly/lib/Support/DumpModulePass.cpp
+++ b/polly/lib/Support/DumpModulePass.cpp
@@ -12,7 +12,6 @@
 #include "polly/Support/DumpModulePass.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -47,56 +46,10 @@ static void runDumpModule(llvm::Module &M, StringRef Filename, bool IsSuffix) {
   M.print(Out->os(), nullptr);
   Out->keep();
 }
-
-class DumpModuleWrapperPass final : public ModulePass {
-private:
-  DumpModuleWrapperPass(const DumpModuleWrapperPass &) = delete;
-  const DumpModuleWrapperPass &
-  operator=(const DumpModuleWrapperPass &) = delete;
-
-  std::string Filename;
-  bool IsSuffix;
-
-public:
-  static char ID;
-
-  /// This constructor is used e.g. if using opt -polly-dump-module.
-  ///
-  /// Provide a default suffix to not overwrite the original file.
- explicit DumpModuleWrapperPass() - : ModulePass(ID), Filename("-dump"), IsSuffix(true) {} - - explicit DumpModuleWrapperPass(std::string Filename, bool IsSuffix) - : ModulePass(ID), Filename(std::move(Filename)), IsSuffix(IsSuffix) {} - - /// @name ModulePass interface - //@{ - void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } - - bool runOnModule(llvm::Module &M) override { - runDumpModule(M, Filename, IsSuffix); - return false; - } - //@} -}; - -char DumpModuleWrapperPass::ID; } // namespace -ModulePass *polly::createDumpModuleWrapperPass(std::string Filename, - bool IsSuffix) { - return new DumpModuleWrapperPass(std::move(Filename), IsSuffix); -} - llvm::PreservedAnalyses DumpModulePass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) { runDumpModule(M, Filename, IsSuffix); return PreservedAnalyses::all(); } - -INITIALIZE_PASS_BEGIN(DumpModuleWrapperPass, "polly-dump-module", - "Polly - Dump Module", false, false) -INITIALIZE_PASS_END(DumpModuleWrapperPass, "polly-dump-module", - "Polly - Dump Module", false, false) diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def index 2c792a5867100..496839760a844 100644 --- a/polly/lib/Support/PollyPasses.def +++ b/polly/lib/Support/PollyPasses.def @@ -1,3 +1,10 @@ +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, CREATE_PASS, PARSER) +#endif +MODULE_PASS("polly", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyDefaultOptions) +MODULE_PASS("polly-custom", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyCustomOptions) +#undef MODULE_PASS + #ifndef CGSCC_PASS #define CGSCC_PASS(NAME, CREATE_PASS, PARSER) #endif @@ -12,15 +19,17 @@ FUNCTION_ANALYSIS("polly-function-scops", ScopInfoAnalysis()) #undef FUNCTION_ANALYSIS #ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS) +#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) #endif -FUNCTION_PASS("polly-prepare", CodePreparationPass()) -FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs())) -FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs())) -FUNCTION_PASS("polly-scop-viewer", ScopViewer()) -FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer()) -FUNCTION_PASS("polly-scop-printer", ScopPrinter()) -FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter()) +FUNCTION_PASS("polly-prepare", CodePreparationPass(), parseNoOptions) +FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs()), parseNoOptions) +FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs()), parseNoOptions) +FUNCTION_PASS("polly-scop-viewer", ScopViewer(), parseNoOptions) +FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer(), parseNoOptions) +FUNCTION_PASS("polly-scop-printer", ScopPrinter(), parseNoOptions) +FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter(), parseNoOptions) +FUNCTION_PASS("polly", PollyFunctionPass(Opts), parsePollyDefaultOptions) +FUNCTION_PASS("polly-custom", PollyFunctionPass(Opts), parsePollyCustomOptions) #undef FUNCTION_PASS #ifndef SCOP_ANALYSIS diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 04f8715502c38..2f1d7a8362349 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -28,8 +28,9 @@ #include "polly/DependenceInfo.h" #include "polly/ForwardOpTree.h" #include "polly/JSONExporter.h" -#include "polly/LinkAllPasses.h" #include "polly/MaximalStaticExpansion.h" +#include "polly/Options.h" 
+#include "polly/Pass/PollyFunctionPass.h" #include "polly/PruneUnprofitable.h" #include "polly/ScheduleOptimizer.h" #include "polly/ScopDetection.h" @@ -52,6 +53,8 @@ #include "llvm/Transforms/IPO.h" using namespace llvm; +using namespace polly; + namespace cl = llvm::cl; using namespace polly; @@ -201,58 +204,19 @@ static cl::opt<bool> EnablePruneUnprofitable( cl::desc("Bail out on unprofitable SCoPs before rescheduling"), cl::Hidden, cl::init(true), cl::cat(PollyCategory)); -namespace { +static cl::opt<bool> + PollyPrintDetect("polly-print-detect", + cl::desc("Polly - Print static control parts (SCoPs)"), + cl::cat(PollyCategory)); -/// Initialize Polly passes when library is loaded. -/// -/// We use the constructor of a statically declared object to initialize the -/// different Polly passes right after the Polly library is loaded. This ensures -/// that the Polly passes are available e.g. in the 'opt' tool. -struct StaticInitializer { - StaticInitializer() { - llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); - polly::initializePollyPasses(Registry); - } -}; -static StaticInitializer InitializeEverything; -} // end of anonymous namespace. - -void initializePollyPasses(llvm::PassRegistry &Registry) { - initializeCodeGenerationPass(Registry); - - initializeCodePreparationPass(Registry); - initializeDeadCodeElimWrapperPassPass(Registry); - initializeDependenceInfoPass(Registry); - initializeDependenceInfoPrinterLegacyPassPass(Registry); - initializeDependenceInfoWrapperPassPass(Registry); - initializeDependenceInfoPrinterLegacyFunctionPassPass(Registry); - initializeJSONExporterPass(Registry); - initializeJSONImporterPass(Registry); - initializeJSONImporterPrinterLegacyPassPass(Registry); - initializeMaximalStaticExpanderWrapperPassPass(Registry); - initializeIslAstInfoWrapperPassPass(Registry); - initializeIslAstInfoPrinterLegacyPassPass(Registry); - initializeIslScheduleOptimizerWrapperPassPass(Registry); - initializeIslScheduleOptimizerPrinterLegacyPassPass(Registry); - initializePollyCanonicalizePass(Registry); - initializeScopDetectionWrapperPassPass(Registry); - initializeScopDetectionPrinterLegacyPassPass(Registry); - initializeScopInlinerWrapperPassPass(Registry); - initializeScopInfoRegionPassPass(Registry); - initializeScopInfoPrinterLegacyRegionPassPass(Registry); - initializeScopInfoWrapperPassPass(Registry); - initializeScopInfoPrinterLegacyFunctionPassPass(Registry); - initializeFlattenSchedulePass(Registry); - initializeFlattenSchedulePrinterLegacyPassPass(Registry); - initializeForwardOpTreeWrapperPassPass(Registry); - initializeForwardOpTreePrinterLegacyPassPass(Registry); - initializeDeLICMWrapperPassPass(Registry); - initializeDeLICMPrinterLegacyPassPass(Registry); - initializeSimplifyWrapperPassPass(Registry); - initializeSimplifyPrinterLegacyPassPass(Registry); - initializeDumpModuleWrapperPassPass(Registry); - initializePruneUnprofitableWrapperPassPass(Registry); -} +static cl::opt<bool> + PollyPrintScops("polly-print-scops", + cl::desc("Print polyhedral description of all regions"), + cl::cat(PollyCategory)); + +static cl::opt<bool> PollyPrintDeps("polly-print-deps", + cl::desc("Polly - Print dependences"), + cl::cat(PollyCategory)); static bool shouldEnablePollyForOptimization() { return PollyEnabled; } @@ -266,6 +230,198 @@ static bool shouldEnablePollyForDiagnostic() { ExportJScop; } +/// Parser of parameters for LoopVectorize pass. 
+/// Parser of parameters for the Polly pass options.
+static llvm::Expected<PollyPassOptions> parsePollyOptions(StringRef Params,
+                                                          bool IsCustom) {
+  PassPhase PrevPhase = PassPhase::None;
+
+  bool EnableDefaultOpts = !IsCustom;
+  bool EnableEnd2End = !IsCustom;
+  std::optional<bool>
+      PassEnabled[static_cast<size_t>(PassPhase::PassPhaseLast) + 1];
+  PassPhase StopAfter = PassPhase::None;
+
+  // Passes enabled using command-line flags (can be overridden using
+  // 'polly<no-pass>')
+  if (PollyPrintDetect)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintDetect)] = true;
+  if (PollyPrintScops)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintScopInfo)] = true;
+  if (PollyPrintDeps)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintDependences)] = true;
+
+  if (PollyViewer)
+    PassEnabled[static_cast<size_t>(PassPhase::ViewScops)] = true;
+  if (PollyOnlyViewer)
+    PassEnabled[static_cast<size_t>(PassPhase::ViewScopsOnly)] = true;
+  if (PollyPrinter)
+    PassEnabled[static_cast<size_t>(PassPhase::DotScops)] = true;
+  if (PollyOnlyPrinter)
+    PassEnabled[static_cast<size_t>(PassPhase::DotScopsOnly)] = true;
+  if (!EnableSimplify)
+    PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = false;
+  if (!EnableForwardOpTree)
+    PassEnabled[static_cast<size_t>(PassPhase::Optree)] = false;
+  if (!EnableDeLICM)
+    PassEnabled[static_cast<size_t>(PassPhase::DeLICM)] = false;
+  if (!EnableSimplify)
+    PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = false;
+  if (ImportJScop)
+    PassEnabled[static_cast<size_t>(PassPhase::ImportJScop)] = true;
+  if (DeadCodeElim)
+    PassEnabled[static_cast<size_t>(PassPhase::DeadCodeElimination)] = true;
+  if (FullyIndexedStaticExpansion)
+    PassEnabled[static_cast<size_t>(PassPhase::MaximumStaticExtension)] = true;
+  if (!EnablePruneUnprofitable)
+    PassEnabled[static_cast<size_t>(PassPhase::PruneUnprofitable)] = false;
+  switch (Optimizer) {
+  case OPTIMIZER_NONE:
+    // explicitly switched off
+    PassEnabled[static_cast<size_t>(PassPhase::Optimization)] = false;
+    break;
+  case OPTIMIZER_ISL:
+    // default: enabled
+    break;
+  }
+  if (ExportJScop)
+    PassEnabled[static_cast<size_t>(PassPhase::ExportJScop)] = true;
+  switch (CodeGeneration) {
+  case CODEGEN_AST:
+    PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = true;
+    PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false;
+    break;
+  case CODEGEN_FULL:
+    // default: ast and codegen enabled
+    break;
+  case CODEGEN_NONE:
+    PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = false;
+    PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false;
+    break;
+  }
+
+  while (!Params.empty()) {
+    StringRef Param;
+    std::tie(Param, Params) = Params.split(';');
+    auto [ParamName, ParamVal] = Param.split('=');
+
+    if (ParamName == "stopafter") {
+      StopAfter = parsePhase(ParamVal);
+      if (StopAfter == PassPhase::None)
+        return make_error<StringError>(
+            formatv("invalid stopafter parameter value '{0}'", ParamVal).str(),
+            inconvertibleErrorCode());
+      continue;
+    }
+
+    if (!ParamVal.empty())
+      return make_error<StringError>(
+          formatv("parameter '{0}' does not take a value", ParamName).str(),
+          inconvertibleErrorCode());
+
+    bool Enabled = true;
+    if (ParamName.starts_with("no-")) {
+      Enabled = false;
+      ParamName = ParamName.drop_front(3);
+    }
+
+    if (ParamName == "default-opts") {
+      EnableDefaultOpts = Enabled;
+      continue;
+    }
+
+    if (ParamName == "end2end") {
+      EnableEnd2End = Enabled;
+      continue;
+    }
+
+    PassPhase Phase;
+
+    // Shortcut for both simplify phases at the same time
+    if (ParamName == "simplify") {
+      PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = Enabled;
+      PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = Enabled;
+      Phase = PassPhase::Simplify0;
+    } else {
+      Phase = parsePhase(ParamName);
+      if (Phase == PassPhase::None)
+        return make_error<StringError>(
+            formatv("invalid Polly parameter/phase name '{0}'", ParamName)
+                .str(),
+            inconvertibleErrorCode());
+
+      if (PrevPhase >= Phase)
+        return make_error<StringError>(
+            formatv("phases must not be repeated and must be listed in "
+                    "order: '{0}' listed before '{1}'",
+                    getPhaseName(PrevPhase), getPhaseName(Phase))
+                .str(),
+            inconvertibleErrorCode());
+
+      PassEnabled[static_cast<size_t>(Phase)] = Enabled;
+    }
+    PrevPhase = Phase;
+  }
+
+  PollyPassOptions Opts;
+  Opts.ViewAll = ViewAll;
+  Opts.ViewFilter = ViewFilter;
+  Opts.PrintDepsAnalysisLevel = OptAnalysisLevel;
+
+  // Implicitly enable required phases first. May be overridden explicitly
+  // on/off later.
+  for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst,
+                                              PassPhase::PassPhaseLast)) {
+    bool Enabled = PassEnabled[static_cast<size_t>(P)].value_or(false);
+    if (!Enabled)
+      continue;
+
+    if (static_cast<size_t>(PassPhase::Detection) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::Detection);
+
+    if (static_cast<size_t>(PassPhase::ScopInfo) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::ScopInfo);
+
+    if (dependsOnDependenceInfo(P))
+      Opts.setPhaseEnabled(PassPhase::Dependences);
+
+    if (static_cast<size_t>(PassPhase::AstGen) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::AstGen);
+  }
+
+  if (EnableEnd2End)
+    Opts.enableEnd2End();
+
+  if (EnableDefaultOpts)
+    Opts.enableDefaultOpts();
+
+  for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst,
+                                              PassPhase::PassPhaseLast)) {
+    std::optional<bool> Enabled = PassEnabled[static_cast<size_t>(P)];
+
+    // Apply only if set explicitly.
+    if (Enabled.has_value())
+      Opts.setPhaseEnabled(P, *Enabled);
+  }
+
+  if (StopAfter != PassPhase::None)
+    Opts.disableAfter(StopAfter);
+
+  if (Error CheckResult = Opts.checkConsistency())
+    return CheckResult;
+
+  return Opts;
+}
+
+static llvm::Expected<PollyPassOptions>
+parsePollyDefaultOptions(StringRef Params) {
+  return parsePollyOptions(Params, false);
+}
+
+static llvm::Expected<PollyPassOptions>
+parsePollyCustomOptions(StringRef Params) {
+  return parsePollyOptions(Params, true);
+}
+
 /// Register Polly passes such that they form a polyhedral optimizer.
 ///
 /// The individual Polly passes are registered in the pass manager such that
@@ -305,77 +461,12 @@ static void buildCommonPollyPipeline(FunctionPassManager &PM,
                                      OptimizationLevel Level,
                                      bool EnableForOpt) {
   PassBuilder PB;
-  ScopPassManager SPM;
-
-  PM.addPass(CodePreparationPass());
-
-  // TODO add utility passes for the various command line options, once they're
-  // ported
-
-  if (PollyDetectOnly) {
-    // Don't add more passes other than the ScopPassManager's detection passes.
- PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); - return; - } - if (PollyViewer) - PM.addPass(ScopViewer()); - if (PollyOnlyViewer) - PM.addPass(ScopOnlyViewer()); - if (PollyPrinter) - PM.addPass(ScopPrinter()); - if (PollyOnlyPrinter) - PM.addPass(ScopOnlyPrinter()); - if (EnableSimplify) - SPM.addPass(SimplifyPass(0)); - if (EnableForwardOpTree) - SPM.addPass(ForwardOpTreePass()); - if (EnableDeLICM) - SPM.addPass(DeLICMPass()); - if (EnableSimplify) - SPM.addPass(SimplifyPass(1)); - - if (ImportJScop) - SPM.addPass(JSONImportPass()); - - if (DeadCodeElim) - SPM.addPass(DeadCodeElimPass()); - - if (FullyIndexedStaticExpansion) - SPM.addPass(MaximalStaticExpansionPass()); - - if (EnablePruneUnprofitable) - SPM.addPass(PruneUnprofitablePass()); - - switch (Optimizer) { - case OPTIMIZER_NONE: - break; /* Do nothing */ - case OPTIMIZER_ISL: - SPM.addPass(IslScheduleOptimizerPass()); - break; - } - - if (ExportJScop) - SPM.addPass(JSONExportPass()); - - if (!EnableForOpt) - return; - - switch (CodeGeneration) { - case CODEGEN_AST: - SPM.addPass( - llvm::RequireAnalysisPass<IslAstAnalysis, Scop, ScopAnalysisManager, - ScopStandardAnalysisResults &, - SPMUpdater &>()); - break; - case CODEGEN_FULL: - SPM.addPass(CodeGenerationPass()); - break; - case CODEGEN_NONE: - break; - } + ExitOnError Err("Inconsistent Polly configuration: "); + PollyPassOptions &&Opts = + Err(parsePollyOptions(StringRef(), /*IsCustom=*/false)); + PM.addPass(PollyFunctionPass(Opts)); - PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); PM.addPass(PB.buildFunctionSimplificationPipeline( Level, llvm::ThinOrFullLTOPhase::None)); // Cleanup @@ -492,8 +583,9 @@ parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM, return false; } -static bool +static llvm::Expected<bool> parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, + PassInstrumentationCallbacks *PIC, ArrayRef<PassBuilder::PipelineElement> Pipeline) { if (llvm::parseAnalysisUtilityPasses<OwningScopAnalysisManagerFunctionProxy>( "polly-scop-analyses", Name, FPM)) @@ -505,8 +597,13 @@ parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, FPM)) \ return true; -#define FUNCTION_PASS(NAME, CREATE_PASS) \ - if (Name == NAME) { \ +#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ + if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ + auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ + if (!ExpectedOpts) \ + return ExpectedOpts.takeError(); \ + auto &&Opts = *ExpectedOpts; \ + (void)Opts; \ FPM.addPass(CREATE_PASS); \ return true; \ } @@ -592,6 +689,28 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, return true; } +static llvm::Expected<bool> +parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, + PassInstrumentationCallbacks *PIC, + ArrayRef<PassBuilder::PipelineElement> Pipeline) { + assert(Pipeline.empty()); + +#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ + if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ + auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ + if (!ExpectedOpts) \ + return ExpectedOpts.takeError(); \ + auto &&Opts = *ExpectedOpts; \ + (void)Opts; \ + MPM.addPass(CREATE_PASS); \ + return true; \ + } + +#include "PollyPasses.def" + + return false; +} + /// Register Polly to be available as an optimizer /// /// @@ -620,10 +739,36 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, /// handle LICMed code to make it useful. 
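///
/// As an illustrative sketch of the resulting pipeline syntax (the first two
/// invocations are taken from the RUN lines updated later in this patch; the
/// full set of phase names is defined by PollyPasses.def, and the
/// 'stopafter=detect' spelling is only an assumed example):
///
///   opt -passes='polly<no-default-opts>' input.ll
///   opt -passes='polly-custom<detect>' -polly-print-detect input.ll
///   opt -passes='polly<stopafter=detect>' input.ll
///
/// A 'no-' prefix disables the named phase, and 'stopafter=<phase>' disables
/// everything after the given phase.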
void registerPollyPasses(PassBuilder &PB) {
   PassInstrumentationCallbacks *PIC = PB.getPassInstrumentationCallbacks();
+
+#define MODULE_PASS(NAME, CREATE_PASS, PARSER)                                 \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#define CGSCC_PASS(NAME, CREATE_PASS, PARSER)                                  \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER)                               \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#include "PollyPasses.def"
+
   PB.registerAnalysisRegistrationCallback([PIC](FunctionAnalysisManager &FAM) {
     registerFunctionAnalyses(FAM, PIC);
   });
-  PB.registerPipelineParsingCallback(parseFunctionPipeline);
+  PB.registerPipelineParsingCallback(
+      [PIC](StringRef Name, FunctionPassManager &FPM,
+            ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
+        ExitOnError Err("Unable to parse Polly function pass: ");
+        return Err(parseFunctionPipeline(Name, FPM, PIC, Pipeline));
+      });
   PB.registerPipelineParsingCallback(
       [PIC](StringRef Name, FunctionPassManager &FPM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
@@ -635,6 +780,12 @@ void registerPollyPasses(PassBuilder &PB) {
           ExitOnError Err("Unable to parse Polly call graph pass: ");
           return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline));
         });
+  PB.registerPipelineParsingCallback(
+      [PIC](StringRef Name, ModulePassManager &MPM,
+            ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
+        ExitOnError Err("Unable to parse Polly module pass: ");
+        return Err(parseModulePipeline(Name, MPM, PIC, Pipeline));
+      });
   PB.registerParseTopLevelPipelineCallback(
       [PIC](llvm::ModulePassManager &MPM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index a2328d1bbb3cf..cf0ec4432f747 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -206,18 +206,6 @@ void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, DominatorTree *DT,
   splitBlock(EntryBlock, I, DT, LI, RI);
 }
 
-void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, Pass *P) {
-  auto *DTWP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  auto *LIWP = P->getAnalysisIfAvailable<LoopInfoWrapperPass>();
-  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
-  RegionInfoPass *RIP = P->getAnalysisIfAvailable<RegionInfoPass>();
-  RegionInfo *RI = RIP ? &RIP->getRegionInfo() : nullptr;
-
-  // splitBlock updates DT, LI and RI.
- polly::splitEntryBlockForAlloca(EntryBlock, DT, LI, RI); -} - void polly::recordAssumption(polly::RecordedAssumptionsTy *RecordedAssumptions, polly::AssumptionKind Kind, isl::set Set, DebugLoc Loc, polly::AssumptionSign Sign, diff --git a/polly/lib/Transform/Canonicalization.cpp b/polly/lib/Transform/Canonicalization.cpp index 1be560e64af40..cd7195f5374df 100644 --- a/polly/lib/Transform/Canonicalization.cpp +++ b/polly/lib/Transform/Canonicalization.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "polly/Canonicalization.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -39,24 +38,6 @@ static cl::opt<bool> cl::desc("Run an early inliner pass before Polly"), cl::Hidden, cl::cat(PollyCategory)); -void polly::registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM) { - bool UseMemSSA = true; - PM.add(llvm::createPromoteMemoryToRegisterPass()); - PM.add(llvm::createEarlyCSEPass(UseMemSSA)); - PM.add(llvm::createInstructionCombiningPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createTailCallEliminationPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createReassociatePass()); - if (PollyInliner) { - PM.add(llvm::createPromoteMemoryToRegisterPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createInstructionCombiningPass()); - PM.add(createBarrierNoopPass()); - } - PM.add(llvm::createInstructionCombiningPass()); -} - /// Adapted from llvm::PassBuilder::buildInlinerPipeline static ModuleInlinerWrapperPass buildInlinePasses(llvm::OptimizationLevel Level) { @@ -125,49 +106,3 @@ polly::buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, return FPM; } - -namespace { -class PollyCanonicalize final : public ModulePass { - PollyCanonicalize(const PollyCanonicalize &) = delete; - const PollyCanonicalize &operator=(const PollyCanonicalize &) = delete; - -public: - static char ID; - - explicit PollyCanonicalize() : ModulePass(ID) {} - ~PollyCanonicalize(); - - /// @name FunctionPass interface. 
- //@{ - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnModule(Module &M) override; - void print(raw_ostream &OS, const Module *) const override; - //@} -}; -} // namespace - -PollyCanonicalize::~PollyCanonicalize() {} - -void PollyCanonicalize::getAnalysisUsage(AnalysisUsage &AU) const {} - -void PollyCanonicalize::releaseMemory() {} - -bool PollyCanonicalize::runOnModule(Module &M) { - legacy::PassManager PM; - registerCanonicalicationPasses(PM); - PM.run(M); - - return true; -} - -void PollyCanonicalize::print(raw_ostream &OS, const Module *) const {} - -char PollyCanonicalize::ID = 0; - -Pass *polly::createPollyCanonicalizePass() { return new PollyCanonicalize(); } - -INITIALIZE_PASS_BEGIN(PollyCanonicalize, "polly-canonicalize", - "Polly - Run canonicalization passes", false, false) -INITIALIZE_PASS_END(PollyCanonicalize, "polly-canonicalize", - "Polly - Run canonicalization passes", false, false) diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp index d045fb6b62c90..5b96c865ad80f 100644 --- a/polly/lib/Transform/CodePreparation.cpp +++ b/polly/lib/Transform/CodePreparation.cpp @@ -16,13 +16,11 @@ //===----------------------------------------------------------------------===// #include "polly/CodePreparation.h" -#include "polly/LinkAllPasses.h" #include "polly/Support/ScopHelper.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/InitializePasses.h" using namespace llvm; using namespace polly; @@ -47,32 +45,6 @@ static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI, return true; } -namespace { - -/// Prepare the IR for the scop detection. -/// -class CodePreparation final : public FunctionPass { - CodePreparation(const CodePreparation &) = delete; - const CodePreparation &operator=(const CodePreparation &) = delete; - - void clear(); - -public: - static char ID; - - explicit CodePreparation() : FunctionPass(ID) {} - ~CodePreparation(); - - /// @name FunctionPass interface. 
- //@{ - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnFunction(Function &F) override; - void print(raw_ostream &OS, const Module *) const override; - //@} -}; -} // namespace - PreservedAnalyses CodePreparationPass::run(Function &F, FunctionAnalysisManager &FAM) { auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); @@ -86,44 +58,3 @@ PreservedAnalyses CodePreparationPass::run(Function &F, PA.preserve<LoopAnalysis>(); return PA; } - -void CodePreparation::clear() {} - -CodePreparation::~CodePreparation() { clear(); } - -void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<RegionInfoPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<DominanceFrontierWrapperPass>(); -} - -bool CodePreparation::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - RegionInfo *RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); - - runCodePreprationImpl(F, DT, LI, RI); - - return true; -} - -void CodePreparation::releaseMemory() { clear(); } - -void CodePreparation::print(raw_ostream &OS, const Module *) const {} - -char CodePreparation::ID = 0; -char &polly::CodePreparationID = CodePreparation::ID; - -Pass *polly::createCodePreparationPass() { return new CodePreparation(); } - -INITIALIZE_PASS_BEGIN(CodePreparation, "polly-prepare", - "Polly - Prepare code for polly", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(CodePreparation, "polly-prepare", - "Polly - Prepare code for polly", false, false) diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index 9a9768afe113e..e8f2d951404f3 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "polly/DeLICM.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -25,7 +24,6 @@ #include "polly/ZoneAlgo.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-delicm" @@ -35,6 +33,10 @@ using namespace llvm; namespace { +static cl::opt<bool> PollyPrintDeLICM("polly-print-delicm", + cl::desc("Polly - Print DeLICM/DePRE"), + cl::cat(PollyCategory)); + cl::opt<int> DelicmMaxOps("polly-delicm-max-ops", cl::desc("Maximum number of isl operations to invest for " @@ -1356,7 +1358,10 @@ class DeLICMImpl final : public ZoneAlgorithm { } /// Return whether at least one transformation been applied. 
- bool isModified() const { return NumberOfTargetsMapped > 0; } + bool isModified() const { + return NumberOfTargetsMapped > 0 || NumberOfMappedValueScalars > 0 || + NumberOfMappedPHIScalars > 0; + } }; static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { @@ -1376,7 +1381,7 @@ static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { return Impl; } -static std::unique_ptr<DeLICMImpl> runDeLICM(Scop &S, LoopInfo &LI) { +static std::unique_ptr<DeLICMImpl> runDeLICMImpl(Scop &S, LoopInfo &LI) { std::unique_ptr<DeLICMImpl> Impl = collapseToUnused(S, LI); Scop::ScopStatistics ScopStats = S.getStatistics(); @@ -1394,7 +1399,7 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<DeLICMImpl> Impl = runDeLICM(S, LI); + std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); if (OS) { *OS << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" @@ -1417,88 +1422,8 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } - -class DeLICMWrapperPass final : public ScopPass { -private: - DeLICMWrapperPass(const DeLICMWrapperPass &) = delete; - const DeLICMWrapperPass &operator=(const DeLICMWrapperPass &) = delete; - - /// The pass implementation, also holding per-scop data. - std::unique_ptr<DeLICMImpl> Impl; - -public: - static char ID; - explicit DeLICMWrapperPass() : ScopPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - // Free resources for previous scop's computation, if not yet done. - releaseMemory(); - - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - Impl = runDeLICM(S, LI); - - return Impl->isModified(); - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (!Impl) - return; - assert(Impl->getScop() == &S); - - OS << "DeLICM result:\n"; - Impl->print(OS); - } - - void releaseMemory() override { Impl.reset(); } -}; - -char DeLICMWrapperPass::ID; - -/// Print result from DeLICMWrapperPass. 
-class DeLICMPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {} - explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - DeLICMWrapperPass &P = getAnalysis<DeLICMWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DeLICMWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DeLICMPrinterLegacyPass::ID = 0; } // anonymous namespace -Pass *polly::createDeLICMWrapperPass() { return new DeLICMWrapperPass(); } - -llvm::Pass *polly::createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS) { - return new DeLICMPrinterLegacyPass(OS); -} - llvm::PreservedAnalyses polly::DeLICMPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, @@ -1527,15 +1452,21 @@ bool polly::isConflicting( return Knowledge::isConflicting(Existing, Proposed, OS, Indent); } -INITIALIZE_PASS_BEGIN(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE", - false, false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE", - false, false) - -INITIALIZE_PASS_BEGIN(DeLICMPrinterLegacyPass, "polly-print-delicm", - "Polly - Print DeLICM/DePRE", false, false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass) -INITIALIZE_PASS_END(DeLICMPrinterLegacyPass, "polly-print-delicm", - "Polly - Print DeLICM/DePRE", false, false) +bool polly::runDeLICM(Scop &S) { + LoopInfo &LI = *S.getLI(); + std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); + + if (PollyPrintDeLICM) { + outs() << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Impl) { + assert(Impl->getScop() == &S); + + outs() << "DeLICM result:\n"; + Impl->print(outs()); + } + } + + return Impl->isModified(); +} diff --git a/polly/lib/Transform/DeadCodeElimination.cpp b/polly/lib/Transform/DeadCodeElimination.cpp index 5cb89fec09fe8..df95e5190431c 100644 --- a/polly/lib/Transform/DeadCodeElimination.cpp +++ b/polly/lib/Transform/DeadCodeElimination.cpp @@ -33,7 +33,6 @@ #include "polly/DeadCodeElimination.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "llvm/Support/CommandLine.h" @@ -51,20 +50,6 @@ cl::opt<int> DCEPreciseSteps( "before the actual dead code elimination."), cl::init(-1), cl::cat(PollyCategory)); -class DeadCodeElimWrapperPass final : public ScopPass { -public: - static char ID; - explicit DeadCodeElimWrapperPass() : ScopPass(ID) {} - - /// Remove dead iterations from the schedule of @p S. - bool runOnScop(Scop &S) override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -char DeadCodeElimWrapperPass::ID = 0; - /// Return the set of live iterations. 
///
/// The set of live iterations are all iterations that write to memory and for
@@ -144,29 +129,19 @@ static bool runDeadCodeElimination(Scop &S, int PreciseSteps,
   return S.restrictDomains(Live);
 }
 
-bool DeadCodeElimWrapperPass::runOnScop(Scop &S) {
-  auto &DI = getAnalysis<DependenceInfo>();
-  const Dependences &Deps = DI.getDependences(Dependences::AL_Statement);
+} // namespace
+
+bool polly::runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA) {
+  const Dependences &Deps = DA.getDependences(Dependences::AL_Statement);
 
   bool Changed = runDeadCodeElimination(S, DCEPreciseSteps, Deps);
 
   // FIXME: We can probably avoid the recomputation of all dependences by
   // updating them explicitly.
   if (Changed)
-    DI.recomputeDependences(Dependences::AL_Statement);
-
-  return false;
-}
-
-void DeadCodeElimWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
-  ScopPass::getAnalysisUsage(AU);
-  AU.addRequired<DependenceInfo>();
-}
-
-} // namespace
+    DA.recomputeDependences(Dependences::AL_Statement);
 
-Pass *polly::createDeadCodeElimWrapperPass() {
-  return new DeadCodeElimWrapperPass();
+  return Changed;
 }
 
 llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
@@ -191,10 +166,3 @@ llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
   PA.preserveSet<AllAnalysesOn<Loop>>();
   return PA;
 }
-
-INITIALIZE_PASS_BEGIN(DeadCodeElimWrapperPass, "polly-dce",
-                      "Polly - Remove dead iterations", false, false)
-INITIALIZE_PASS_DEPENDENCY(DependenceInfo)
-INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass)
-INITIALIZE_PASS_END(DeadCodeElimWrapperPass, "polly-dce",
-                    "Polly - Remove dead iterations", false, false)
diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp
index f514ef359ba07..35a8ce6877036 100644
--- a/polly/lib/Transform/FlattenSchedule.cpp
+++ b/polly/lib/Transform/FlattenSchedule.cpp
@@ -14,6 +14,7 @@
 
 #include "polly/FlattenSchedule.h"
 #include "polly/FlattenAlgo.h"
+#include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Support/ISLOStream.h"
@@ -26,6 +27,10 @@ using namespace llvm;
 
 namespace {
 
+static cl::opt<bool> PollyPrintFlattenSchedule("polly-print-flatten-schedule",
+                                               cl::desc("Polly - Print flattened schedule"),
+                                               cl::cat(PollyCategory));
+
 /// Print a schedule to @p OS.
 ///
 /// Prints the schedule for each statements on a new line.
@@ -34,119 +39,45 @@ void printSchedule(raw_ostream &OS, const isl::union_map &Schedule,
   for (isl::map Map : Schedule.get_map_list())
     OS.indent(indent) << Map << "\n";
 }
+} // namespace
 
-/// Flatten the schedule stored in an polly::Scop.
-class FlattenSchedule final : public ScopPass {
-private:
-  FlattenSchedule(const FlattenSchedule &) = delete;
-  const FlattenSchedule &operator=(const FlattenSchedule &) = delete;
-
-  std::shared_ptr<isl_ctx> IslCtx;
-  isl::union_map OldSchedule;
-
-public:
-  static char ID;
-  explicit FlattenSchedule() : ScopPass(ID) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredTransitive<ScopInfoRegionPass>();
-    AU.setPreservesAll();
-  }
-
-  bool runOnScop(Scop &S) override {
-    // Keep a reference to isl_ctx to ensure that it is not freed before we free
-    // OldSchedule.
-    IslCtx = S.getSharedIslCtx();
+void polly::runFlattenSchedulePass(Scop &S) {
+  // Keep a reference to isl_ctx to ensure that it is not freed before we free
+  // OldSchedule.
+ auto IslCtx = S.getSharedIslCtx(); - POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); - OldSchedule = S.getSchedule(); - POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); + POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); + auto OldSchedule = S.getSchedule(); + POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); - auto Domains = S.getDomains(); - auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); - POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); - POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); + auto Domains = S.getDomains(); + auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); + POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); + POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); - auto NewSchedule = flattenSchedule(RestrictedOldSchedule); + auto NewSchedule = flattenSchedule(RestrictedOldSchedule); - POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - NewSchedule = NewSchedule.gist_domain(Domains); - POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + NewSchedule = NewSchedule.gist_domain(Domains); + POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - S.setSchedule(NewSchedule); - return false; - } + S.setSchedule(NewSchedule); - void printScop(raw_ostream &OS, Scop &S) const override { - OS << "Schedule before flattening {\n"; - printSchedule(OS, OldSchedule, 4); - OS << "}\n\n"; + if (PollyPrintFlattenSchedule) { + outs() + << "Printing analysis 'Polly - Print flattened schedule' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; - OS << "Schedule after flattening {\n"; - printSchedule(OS, S.getSchedule(), 4); - OS << "}\n"; - } + outs() << "Schedule before flattening {\n"; + printSchedule(outs(), OldSchedule, 4); + outs() << "}\n\n"; - void releaseMemory() override { - OldSchedule = {}; - IslCtx.reset(); + outs() << "Schedule after flattening {\n"; + printSchedule(outs(), S.getSchedule(), 4); + outs() << "}\n"; } -}; - -char FlattenSchedule::ID; - -/// Print result from FlattenSchedule. 
-class FlattenSchedulePrinterLegacyPass final : public ScopPass { -public: - static char ID; - - FlattenSchedulePrinterLegacyPass() - : FlattenSchedulePrinterLegacyPass(outs()) {} - explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - FlattenSchedule &P = getAnalysis<FlattenSchedule>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<FlattenSchedule>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char FlattenSchedulePrinterLegacyPass::ID = 0; -} // anonymous namespace - -Pass *polly::createFlattenSchedulePass() { return new FlattenSchedule(); } - -Pass *polly::createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) { - return new FlattenSchedulePrinterLegacyPass(OS); } - -INITIALIZE_PASS_BEGIN(FlattenSchedule, "polly-flatten-schedule", - "Polly - Flatten schedule", false, false) -INITIALIZE_PASS_END(FlattenSchedule, "polly-flatten-schedule", - "Polly - Flatten schedule", false, false) - -INITIALIZE_PASS_BEGIN(FlattenSchedulePrinterLegacyPass, - "polly-print-flatten-schedule", - "Polly - Print flattened schedule", false, false) -INITIALIZE_PASS_DEPENDENCY(FlattenSchedule) -INITIALIZE_PASS_END(FlattenSchedulePrinterLegacyPass, - "polly-print-flatten-schedule", - "Polly - Print flattened schedule", false, false) diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index e9be6c9cdcc27..24d4a4af6e681 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -62,6 +61,11 @@ static cl::opt<unsigned> "analysis; 0=no limit"), cl::init(1000000), cl::cat(PollyCategory), cl::Hidden); +static cl::opt<bool> + PollyPrintOptree("polly-print-optree", + cl::desc("Polly - Print forward operand tree result"), + cl::cat(PollyCategory)); + STATISTIC(KnownAnalyzed, "Number of successfully analyzed SCoPs"); STATISTIC(KnownOutOfQuota, "Analyses aborted because max_operations was reached"); @@ -1030,8 +1034,8 @@ class ForwardOpTreeImpl final : ZoneAlgorithm { bool isModified() const { return Modified; } }; -static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTree(Scop &S, - LoopInfo &LI) { +static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTreeImpl(Scop &S, + LoopInfo &LI) { std::unique_ptr<ForwardOpTreeImpl> Impl; { IslMaxOperationsGuard MaxOpGuard(S.getIslCtx().get(), MaxOps, false); @@ -1073,7 +1077,7 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTree(S, LI); + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); if (OS) { *OS << "Printing analysis 'Polly - Forward operand tree' for region: '" << S.getName() << "' in function '" << S.getFunction().getName() @@ -1094,99 +1098,8 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } - -/// Pass that redirects scalar reads to array elements that are known to 
contain -/// the same value. -/// -/// This reduces the number of scalar accesses and therefore potentially -/// increases the freedom of the scheduler. In the ideal case, all reads of a -/// scalar definition are redirected (We currently do not care about removing -/// the write in this case). This is also useful for the main DeLICM pass as -/// there are less scalars to be mapped. -class ForwardOpTreeWrapperPass final : public ScopPass { -private: - /// The pass implementation, also holding per-scop data. - std::unique_ptr<ForwardOpTreeImpl> Impl; - -public: - static char ID; - - explicit ForwardOpTreeWrapperPass() : ScopPass(ID) {} - ForwardOpTreeWrapperPass(const ForwardOpTreeWrapperPass &) = delete; - ForwardOpTreeWrapperPass & - operator=(const ForwardOpTreeWrapperPass &) = delete; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - // Free resources for previous SCoP's computation, if not yet done. - releaseMemory(); - - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - - Impl = runForwardOpTree(S, LI); - - return false; - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (!Impl) - return; - - assert(Impl->getScop() == &S); - Impl->print(OS); - } - - void releaseMemory() override { Impl.reset(); } -}; // class ForwardOpTree - -char ForwardOpTreeWrapperPass::ID; - -/// Print result from ForwardOpTreeWrapperPass. -class ForwardOpTreePrinterLegacyPass final : public ScopPass { -public: - static char ID; - - ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {} - explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - ForwardOpTreeWrapperPass &P = getAnalysis<ForwardOpTreeWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<ForwardOpTreeWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ForwardOpTreePrinterLegacyPass::ID = 0; } // namespace -Pass *polly::createForwardOpTreeWrapperPass() { - return new ForwardOpTreeWrapperPass(); -} - -Pass *polly::createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) { - return new ForwardOpTreePrinterLegacyPass(OS); -} - llvm::PreservedAnalyses ForwardOpTreePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, @@ -1200,14 +1113,20 @@ ForwardOpTreePrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runForwardOpTreeUsingNPM(S, SAM, SAR, U, &OS); } -INITIALIZE_PASS_BEGIN(ForwardOpTreeWrapperPass, "polly-optree", - "Polly - Forward operand tree", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(ForwardOpTreeWrapperPass, "polly-optree", - "Polly - Forward operand tree", false, false) - -INITIALIZE_PASS_BEGIN(ForwardOpTreePrinterLegacyPass, "polly-print-optree", - "Polly - Print forward operand tree result", false, false) -INITIALIZE_PASS_DEPENDENCY(ForwardOpTreeWrapperPass) -INITIALIZE_PASS_END(ForwardOpTreePrinterLegacyPass, "polly-print-optree", - "Polly - Print forward operand tree result", false, false) +bool polly::runForwardOpTree(Scop &S) { + LoopInfo &LI = 
*S.getLI(); + + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); + if (PollyPrintOptree) { + outs() << "Printing analysis 'Polly - Forward operand tree' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Impl) { + assert(Impl->getScop() == &S); + + Impl->print(outs()); + } + } + + return Impl->isModified(); +} diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index 0719840f74a79..62a4d251875c5 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -13,14 +13,13 @@ #include "polly/MaximalStaticExpansion.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" +#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/ISLTools.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/InitializePasses.h" #include "isl/isl-noexceptions.h" #include "isl/union_map.h" #include <cassert> @@ -35,28 +34,10 @@ using namespace polly; namespace { -class MaximalStaticExpanderWrapperPass final : public ScopPass { -public: - static char ID; - - explicit MaximalStaticExpanderWrapperPass() : ScopPass(ID) {} - - ~MaximalStaticExpanderWrapperPass() override = default; - - /// Expand the accesses of the SCoP. - /// - /// @param S The SCoP that must be expanded. - bool runOnScop(Scop &S) override; - - /// Print the SCoP. - /// - /// @param OS The stream where to print. - /// @param S The SCop that must be printed. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformations required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; +static cl::opt<bool> + PollyPrintMSE("polly-print-mse", + cl::desc("Polly - Print Maximal static expansion of SCoP"), + cl::cat(PollyCategory)); #ifndef NDEBUG /// Whether a dimension of a set is bounded (lower and upper) by a constant, @@ -458,8 +439,8 @@ class MaximalStaticExpansionImpl { }; static std::unique_ptr<MaximalStaticExpansionImpl> -runMaximalStaticExpansion(Scop &S, OptimizationRemarkEmitter &ORE, - const Dependences &D) { +runMaximalStaticExpansionImpl(Scop &S, OptimizationRemarkEmitter &ORE, + const Dependences &D) { auto Dependences = D.getDependences(Dependences::TYPE_RAW); std::unique_ptr<MaximalStaticExpansionImpl> Impl = @@ -478,7 +459,7 @@ static PreservedAnalyses runMSEUsingNPM(Scop &S, ScopAnalysisManager &SAM, auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansion(S, ORE, D); + runMaximalStaticExpansionImpl(S, ORE, D); if (OS) { *OS << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " @@ -511,42 +492,24 @@ MaximalStaticExpansionPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runMSEUsingNPM(S, SAM, SAR, &OS); } -char MaximalStaticExpanderWrapperPass::ID = 0; - -bool MaximalStaticExpanderWrapperPass::runOnScop(Scop &S) { - // Get the ORE from OptimizationRemarkEmitterWrapperPass. - OptimizationRemarkEmitter *ORE = - &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); +void polly::runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI) { + OptimizationRemarkEmitter ORE(&S.getFunction()); - // Get the RAW Dependences. 
-  auto &DI = getAnalysis<DependenceInfo>();
   auto &D = DI.getDependences(Dependences::AL_Reference);
 
   std::unique_ptr<MaximalStaticExpansionImpl> Impl =
-      runMaximalStaticExpansion(S, *ORE, D);
+      runMaximalStaticExpansionImpl(S, ORE, D);
 
-  return false;
-}
-
-void MaximalStaticExpanderWrapperPass::printScop(raw_ostream &OS,
-                                                 Scop &S) const {
-  S.print(OS, false);
-}
-
-void MaximalStaticExpanderWrapperPass::getAnalysisUsage(
-    AnalysisUsage &AU) const {
-  ScopPass::getAnalysisUsage(AU);
-  AU.addRequired<DependenceInfo>();
-  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
-}
+  if (PollyPrintMSE) {
+    outs()
+        << "Printing analysis 'Polly - Maximal static expansion of SCoP' for "
+           "region: '"
+        << S.getName() << "' in function '" << S.getFunction().getName()
+        << "':\n";
 
-Pass *polly::createMaximalStaticExpansionPass() {
-  return new MaximalStaticExpanderWrapperPass();
+    if (Impl) {
+      outs() << "MSE result:\n";
+      Impl->print(llvm::outs());
+    }
+  }
 }
-
-INITIALIZE_PASS_BEGIN(MaximalStaticExpanderWrapperPass, "polly-mse",
-                      "Polly - Maximal static expansion of SCoP", false, false);
-INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
-INITIALIZE_PASS_END(MaximalStaticExpanderWrapperPass, "polly-mse",
-                    "Polly - Maximal static expansion of SCoP", false, false)
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 0888ebd7a9362..6acdd6862c4cc 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -57,7 +57,6 @@
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 
 #include "isl/options.h"
@@ -198,6 +197,10 @@ static cl::opt<bool> OptimizedScops(
                  "transformations is applied on the schedule tree"),
     cl::cat(PollyCategory));
 
+static cl::opt<bool> PollyPrintOptIsl("polly-print-opt-isl",
+                                      cl::desc("Polly - Print optimizer schedule of SCoP"),
+                                      cl::cat(PollyCategory));
+
 STATISTIC(ScopsProcessed, "Number of scops processed");
 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
 STATISTIC(ScopsOptimized, "Number of scops optimized");
@@ -638,34 +641,6 @@ bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
   return changed;
 }
 
-class IslScheduleOptimizerWrapperPass final : public ScopPass {
-public:
-  static char ID;
-
-  explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
-
-  /// Optimize the schedule of the SCoP @p S.
-  bool runOnScop(Scop &S) override;
-
-  /// Print the new schedule for the SCoP @p S.
-  void printScop(raw_ostream &OS, Scop &S) const override;
-
-  /// Register all analyses and transformation required.
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  /// Release the internal memory.
- void releaseMemory() override { - LastSchedule = {}; - IslCtx.reset(); - } - -private: - std::shared_ptr<isl_ctx> IslCtx; - isl::schedule LastSchedule; -}; - -char IslScheduleOptimizerWrapperPass::ID = 0; - #ifndef NDEBUG static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, StringRef Desc) { @@ -733,7 +708,7 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { &Version); } -static void runIslScheduleOptimizer( +static void runIslScheduleOptimizerImpl( Scop &S, function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, @@ -950,30 +925,6 @@ static void runIslScheduleOptimizer( errs() << S; } -bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { - releaseMemory(); - - Function &F = S.getFunction(); - IslCtx = S.getSharedIslCtx(); - - auto getDependences = - [this](Dependences::AnalysisLevel) -> const Dependences & { - return getAnalysis<DependenceInfo>().getDependences( - Dependences::AL_Statement); - }; - OptimizationRemarkEmitter &ORE = - getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - TargetTransformInfo *TTI = - &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - - bool DepsChanged = false; - runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, - DepsChanged); - if (DepsChanged) - getAnalysis<DependenceInfo>().abandonDependences(); - return false; -} - static void runScheduleOptimizerPrinter(raw_ostream &OS, isl::schedule LastSchedule) { isl_printer *p; @@ -997,36 +948,8 @@ static void runScheduleOptimizerPrinter(raw_ostream &OS, free(ScheduleStr); } -void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { - runScheduleOptimizerPrinter(OS, LastSchedule); -} - -void IslScheduleOptimizerWrapperPass::getAnalysisUsage( - AnalysisUsage &AU) const { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - - AU.addPreserved<DependenceInfo>(); - AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); -} - } // namespace -Pass *polly::createIslScheduleOptimizerWrapperPass() { - return new IslScheduleOptimizerWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", - "Polly - Optimize schedule of SCoP", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); -INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", - "Polly - Optimize schedule of SCoP", false, false) - static llvm::PreservedAnalyses runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, @@ -1039,7 +962,7 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, TargetTransformInfo *TTI = &SAR.TTI; isl::schedule LastSchedule; bool DepsChanged = false; - runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); if (DepsChanged) Deps.abandonDependences(); @@ -1065,52 +988,23 @@ IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); } -//===----------------------------------------------------------------------===// - -namespace { -/// Print 
result from IslScheduleOptimizerWrapperPass. -class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - IslScheduleOptimizerPrinterLegacyPass() - : IslScheduleOptimizerPrinterLegacyPass(outs()) {} - explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - IslScheduleOptimizerWrapperPass &P = - getAnalysis<IslScheduleOptimizerWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } +void polly::runIslScheduleOptimizer(Scop &S, TargetTransformInfo *TTI, + DependenceAnalysis::Result &Deps) { + auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { + return Deps.getDependences(Dependences::AL_Statement); + }; + OptimizationRemarkEmitter ORE(&S.getFunction()); + isl::schedule LastSchedule; + bool DepsChanged = false; + runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + if (DepsChanged) + Deps.abandonDependences(); - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<IslScheduleOptimizerWrapperPass>(); - AU.setPreservesAll(); + if (PollyPrintOptIsl) { + outs() + << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + runScheduleOptimizerPrinter(outs(), LastSchedule); } - -private: - llvm::raw_ostream &OS; -}; - -char IslScheduleOptimizerPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) { - return new IslScheduleOptimizerPrinterLegacyPass(OS); } - -INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass, - "polly-print-opt-isl", - "Polly - Print optimizer schedule of SCoP", false, false); -INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass) -INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass, - "polly-print-opt-isl", - "Polly - Print optimizer schedule of SCoP", false, false) diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp index c04ba3498339e..8e7a0dedaf533 100644 --- a/polly/lib/Transform/ScopInliner.cpp +++ b/polly/lib/Transform/ScopInliner.cpp @@ -95,53 +95,7 @@ template <typename SCC_t> bool runScopInlinerImpl(Function *F, SCC_t &SCC) { return Changed; } - -class ScopInlinerWrapperPass final : public CallGraphSCCPass { - using llvm::Pass::doInitialization; - -public: - static char ID; - - ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {} - - bool doInitialization(CallGraph &CG) override { - if (!polly::PollyAllowFullFunction) { - report_fatal_error( - "Aborting from ScopInliner because it only makes sense to run with " - "-polly-allow-full-function. " - "The heurtistic for ScopInliner checks that the full function is a " - "Scop, which happens if and only if polly-allow-full-function is " - " enabled. 
" - " If not, the entry block is not included in the Scop"); - } - return true; - } - - bool runOnSCC(CallGraphSCC &SCC) override { - Function *F = (*SCC.begin())->getFunction(); - return runScopInlinerImpl(F, SCC); - }; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - CallGraphSCCPass::getAnalysisUsage(AU); - } -}; } // namespace -char ScopInlinerWrapperPass::ID; - -Pass *polly::createScopInlinerWrapperPass() { - ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass(); - return pass; -} - -INITIALIZE_PASS_BEGIN( - ScopInlinerWrapperPass, "polly-scop-inliner", - "inline functions based on how much of the function is a scop.", false, - false) -INITIALIZE_PASS_END( - ScopInlinerWrapperPass, "polly-scop-inliner", - "inline functions based on how much of the function is a scop.", false, - false) polly::ScopInlinerPass::ScopInlinerPass() { if (!polly::PollyAllowFullFunction) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index 75e91cd1c031a..cf0f8c5ca5ef2 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "polly/Simplify.h" +#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/GICHelper.h" @@ -18,7 +19,6 @@ #include "polly/Support/ISLTools.h" #include "polly/Support/VirtualInstruction.h" #include "llvm/ADT/Statistic.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include <optional> @@ -30,6 +30,11 @@ using namespace polly; namespace { +static cl::opt<bool> + PollyPrintSimplify("polly-print-simplify", + cl::desc("Polly - Print Simplify actions"), + cl::cat(PollyCategory)); + #define TWO_STATISTICS(VARNAME, DESC) \ static llvm::Statistic VARNAME[2] = { \ {DEBUG_TYPE, #VARNAME "0", DESC " (first)"}, \ @@ -756,39 +761,6 @@ void SimplifyImpl::printScop(raw_ostream &OS, Scop &S) const { printAccesses(OS); } -class SimplifyWrapperPass final : public ScopPass { -public: - static char ID; - int CallNo; - std::optional<SimplifyImpl> Impl; - - explicit SimplifyWrapperPass(int CallNo = 0) : ScopPass(ID), CallNo(CallNo) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - - Impl.emplace(CallNo); - Impl->run(S, LI); - - return false; - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (Impl) - Impl->printScop(OS, S); - } - - void releaseMemory() override { Impl.reset(); } -}; - -char SimplifyWrapperPass::ID; - static llvm::PreservedAnalyses runSimplifyUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, int CallNo, @@ -843,58 +815,15 @@ SmallVector<MemoryAccess *, 32> polly::getAccessesInOrder(ScopStmt &Stmt) { return Accesses; } -Pass *polly::createSimplifyWrapperPass(int CallNo) { - return new SimplifyWrapperPass(CallNo); -} - -INITIALIZE_PASS_BEGIN(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", - false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", - false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from SimplifyWrapperPass. 
-class SimplifyPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - SimplifyPrinterLegacyPass() : SimplifyPrinterLegacyPass(outs()) {} - explicit SimplifyPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - SimplifyWrapperPass &P = getAnalysis<SimplifyWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<SimplifyWrapperPass>(); - AU.setPreservesAll(); +bool polly::runSimplify(Scop &S, int CallNo) { + SimplifyImpl Impl(CallNo); + Impl.run(S, S.getLI()); + if (PollyPrintSimplify) { + outs() << "Printing analysis 'Polly - Simplify' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + Impl.printScop(outs(), S); } -private: - llvm::raw_ostream &OS; -}; - -char SimplifyPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createSimplifyPrinterLegacyPass(raw_ostream &OS) { - return new SimplifyPrinterLegacyPass(OS); + return Impl.isModified(); } - -INITIALIZE_PASS_BEGIN(SimplifyPrinterLegacyPass, "polly-print-simplify", - "Polly - Print Simplify actions", false, false) -INITIALIZE_PASS_DEPENDENCY(SimplifyWrapperPass) -INITIALIZE_PASS_END(SimplifyPrinterLegacyPass, "polly-print-simplify", - "Polly - Print Simplify actions", false, false) diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll index 7229a6e3d5240..7de1b843a5b0a 100644 --- a/polly/test/CodeGen/20100617.ll +++ b/polly/test/CodeGen/20100617.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @init_array() nounwind { diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll index bed737741abba..13a6159d3e7a7 100644 --- a/polly/test/CodeGen/20100622.ll +++ b/polly/test/CodeGen/20100622.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | not FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll index ee0422e07c4ea..6a4763dcb3b76 100644 --- a/polly/test/CodeGen/20100707.ll +++ b/polly/test/CodeGen/20100707.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @clause_SetSplitField(i32 %Length) nounwind inlinehint { diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll index a4cd76af9dd3c..648a06479ae27 100644 --- a/polly/test/CodeGen/20100707_2.ll +++ 
b/polly/test/CodeGen/20100707_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll index 9080451aeae50..52153d7cfa730 100644 --- a/polly/test/CodeGen/20100708.ll +++ b/polly/test/CodeGen/20100708.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @execute() nounwind { diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll index 51dc9d311f070..075a4947c8e72 100644 --- a/polly/test/CodeGen/20100708_2.ll +++ b/polly/test/CodeGen/20100708_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @init_array() nounwind { diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll index a836795c9907f..0b0ed7327c8b1 100644 --- a/polly/test/CodeGen/20100713.ll +++ b/polly/test/CodeGen/20100713.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @fft_float(i32 %NumSamples) nounwind { diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll index 28b984bd5900f..5681f34152342 100644 --- a/polly/test/CodeGen/20100713_2.ll +++ b/polly/test/CodeGen/20100713_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define hidden void @luaD_callhook() nounwind { diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll index 51c453cfe438e..97ed151410dfb 100644 --- a/polly/test/CodeGen/20100717.ll +++ b/polly/test/CodeGen/20100717.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @matrixTranspose(ptr %A) nounwind { diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll index fdac75f1b999f..cbee80e44949c 100644 --- a/polly/test/CodeGen/20100718-DomInfo-2.ll +++ b/polly/test/CodeGen/20100718-DomInfo-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' 
-verify-dom-info -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @getNonAffNeighbour() nounwind { diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll index da68eb0dd8fa7..e6fcaf6a9272f 100644 --- a/polly/test/CodeGen/20100718-DomInfo.ll +++ b/polly/test/CodeGen/20100718-DomInfo.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @intrapred_luma_16x16(i32 %predmode) nounwind { diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll index 3dece4efdcd06..66c9e2bb0eb5b 100644 --- a/polly/test/CodeGen/20100720-MultipleConditions.ll +++ b/polly/test/CodeGen/20100720-MultipleConditions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s ;int bar1(); ;int bar2(); diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll index f45b6544464de..cc3a5087090b4 100644 --- a/polly/test/CodeGen/20100809-IndependentBlock.ll +++ b/polly/test/CodeGen/20100809-IndependentBlock.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @cfft2(ptr %x) nounwind { entry: diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll index 82da9d2486423..240c2a49bc46d 100644 --- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll +++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll index fecdb9d4fed1e..c199f757ebac5 100644 --- a/polly/test/CodeGen/20101030-Overflow.ll +++ b/polly/test/CodeGen/20101030-Overflow.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @compdecomp() nounwind { diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll index f1503e25fcc4c..e8b425f009723 100644 --- a/polly/test/CodeGen/20101103-Overflow3.ll +++ b/polly/test/CodeGen/20101103-Overflow3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly 
'-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @Reflection_coefficients(ptr %r) nounwind { bb20: diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll index 3d0c929446f45..0295ee0567208 100644 --- a/polly/test/CodeGen/20101103-signmissmatch.ll +++ b/polly/test/CodeGen/20101103-signmissmatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @CleanNet() nounwind { diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll index 0e62e678f0ae2..6913deed23054 100644 --- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll +++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @main() nounwind { diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll index 32b018f24e547..a39fced9dbaba 100644 --- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll +++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll index b87a3dc60deaa..a7f709b4a7615 100644 --- a/polly/test/CodeGen/20120316-InvalidCast.ll +++ b/polly/test/CodeGen/20120316-InvalidCast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; CHECK: polly.start diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll index dac78bf04a250..554384c0e777e 100644 --- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll +++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; We just check that this compilation does not crash. 
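For reference, the rewrites in this patch follow two recurring shapes. The RUN lines below are an illustrative sketch, not taken from any single test; %loadNPMPolly and %s are the usual lit substitutions. Codegen-style tests replace the standalone pass with the phased pipeline that has Polly's default optimizations disabled (old form first, new form second):

; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s

Print-style tests replace the print<polly-*> wrapper with polly-custom<*> plus the matching explicit print flag:

; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s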
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll index 5728a768a3b3b..101930e175634 100644 --- a/polly/test/CodeGen/20130221.ll +++ b/polly/test/CodeGen/20130221.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @list_sequence(ptr %A) { diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll index cafd68e508255..7ad8cbf963f45 100644 --- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll +++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll index 47fd4f07e4678..ba4ea1565e481 100644 --- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll +++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we generate code without crashing. ; diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll index eb7de01ba862c..a92917f30b724 100644 --- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll +++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll @@ -1,6 +1,6 @@ ; This test checks that we do not accidentally mutate the debug info when ; inserting loop parallel metadata. -; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel < %s | FileCheck %s ; CHECK-NOT: !7 = !{!7} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll index 9bb086fa79aed..0d947004aea50 100644 --- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll +++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s ; ; Check that we mark multiple parallel loops correctly including the memory instructions. 
; diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll index 442600cff7a0a..1293cd91da78d 100644 --- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll +++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SEQUENTIAL +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; This is a trivially parallel loop. We just use it to ensure that we actually diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll index 82fff27dd0eb7..be1c64938422c 100644 --- a/polly/test/CodeGen/MemAccess/bad_alignment.ll +++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s ; ; Check that we do not allow accessing elements not accessed before, because the ; alignment information would become invalid. diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll index 3360e10529f8e..283c8fbd2c249 100644 --- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll +++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll index 0563ca87eef51..ce44f2daceaa9 100644 --- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll +++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll index ee0187fe97d25..ab1dca516a9cf 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll index 6970565bf023e..72f9c2ce61e3c 100644 ---
a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ;float A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll index f0896e2bf6093..a6d9969286fc7 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll @@ -1,5 +1,5 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s ;int A[1040]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll index 99fc36996f083..568b0ff4ae20a 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll @@ -1,5 +1,5 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s ; ;float A[1040]; ; diff --git a/polly/test/CodeGen/MemAccess/create_arrays.ll b/polly/test/CodeGen/MemAccess/create_arrays.ll index 40ae8d6efa95f..8443e0f7be327 100644 --- a/polly/test/CodeGen/MemAccess/create_arrays.ll +++ b/polly/test/CodeGen/MemAccess/create_arrays.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; for (i = 0; i < _PB_NI; i++) ; for (j = 0; j < _PB_NJ; j++) diff --git a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll index 1202d21998c94..9c95378a76433 100644 --- 
a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll +++ b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s --check-prefix=CODEGEN ; ; #define Ni 1056 ; #define Nj 1056 diff --git a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll index 7d8083cc55846..f08fabd67ef5c 100644 --- a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll +++ b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-print-import-jscop -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s ; ; Check that we allow the new access functions even though they access ; different locations than the original ones (but the alignment is the diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll index 407e72702aa86..ae6168d235a96 100644 --- a/polly/test/CodeGen/MemAccess/different_types.ll +++ b/polly/test/CodeGen/MemAccess/different_types.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s ; ; void foo(float A[], float B[]) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll index 7b2286bfc95a9..099a3e0670960 100644 --- a/polly/test/CodeGen/MemAccess/generate-all.ll +++ b/polly/test/CodeGen/MemAccess/generate-all.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \ -; RUN: -S < %s | FileCheck %s -check-prefix=SCEV -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \ -; RUN: -S < %s | FileCheck %s -check-prefix=ASTEXPR +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=false -S < %s | FileCheck %s -check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=true -S < %s | FileCheck %s -check-prefix=ASTEXPR ; ; void foo(float A[]) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll index 5c926ac638413..d8d0df7009685 100644 --- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll +++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-invariant-load-hoisting -S \ -; RUN: 2>&1 < %s | FileCheck %s +; RUN: opt %loadNPMPolly 
'-passes=polly-custom<import-jscop;codegen>' -polly-invariant-load-hoisting -S 2>&1 < %s | FileCheck %s ; Setting new access functions where the base pointer of the array that is newly ; accessed is only loaded within the scop itself caused incorrect code to be diff --git a/polly/test/CodeGen/MemAccess/map_scalar_access.ll b/polly/test/CodeGen/MemAccess/map_scalar_access.ll index 7c845d4a004f4..4ea21b26ce531 100644 --- a/polly/test/CodeGen/MemAccess/map_scalar_access.ll +++ b/polly/test/CodeGen/MemAccess/map_scalar_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-print-import-jscop -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-import-jscop -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s --check-prefix=CODEGEN define void @map_scalar_access(ptr noalias nonnull %A) { entry: diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll index 7848977ce0310..edc3888be364b 100644 --- a/polly/test/CodeGen/MemAccess/multiple_types.ll +++ b/polly/test/CodeGen/MemAccess/multiple_types.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-allow-differing-element-types -S < %s | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll index 5077e1a1b5a2c..63d66f1c925f7 100644 --- a/polly/test/CodeGen/MemAccess/simple.ll +++ b/polly/test/CodeGen/MemAccess/simple.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ;int A[100]; diff --git a/polly/test/CodeGen/MemAccess/simple_analyze.ll b/polly/test/CodeGen/MemAccess/simple_analyze.ll index 143651b565aff..f07cb1629ca18 100644 --- a/polly/test/CodeGen/MemAccess/simple_analyze.ll +++ b/polly/test/CodeGen/MemAccess/simple_analyze.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadPolly -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" @A = common global [100 x i32] zeroinitializer, align 4 diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll index 51fa97adb3c37..93f5f186ad6a5 100644 --- a/polly/test/CodeGen/MemAccess/update_access_functions.ll +++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.loop1: ; CHECK-NEXT: %3 = mul nsw i64 5, %polly.indvar{{[0-9]*}} diff --git a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll index ebe91636ea3cc..344a6d0990837 100644 --- a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll +++ b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-annotate-metadata-vectorize < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-annotate-metadata-vectorize < %s | FileCheck %s ; Basic verification of vectorize metadata getting added when "-polly-annotate-metadata-vectorize" is ; passed.
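Note that every rewritten pipeline spec stays inside single quotes. The RUN command is executed through a shell, where '<' and '>' are redirections and ';' is a command separator, so an unquoted spec such as -passes=polly-custom<import-jscop;codegen> would be torn apart before opt ever sees it. An illustrative quoted form, not from any particular test:

; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s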
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll index 121f630789892..541fbdda5a6b9 100644 --- a/polly/test/CodeGen/OpenMP/alias-metadata.ll +++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s ; ; void foo(float *A, float *B) { ; for (long i = 0; i < 1000; i++) diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll index 7177ae01f0754..657921690c74d 100644 --- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll +++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-opt-max-coefficient=-1 -polly-parallel -S < %s | FileCheck %s ; ; Check that we do not crash but generate parallel code ; diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll index 82a73780886e3..ac6c7070c1abf 100644 --- a/polly/test/CodeGen/OpenMP/inlineasm.ll +++ b/polly/test/CodeGen/OpenMP/inlineasm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-parallel -S < %s | FileCheck %s ; llvm.org/PR51960 ; CHECK-LABEL: define internal void @foo_polly_subfn diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll index aba3ae78f7783..08c0cc7fe37f2 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll index 8cf6148a7b44c..8246aaa25b7b2 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
; diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll index 823e5cab55ab3..0c5208c77768b 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but ; not B[0] as it is not needed diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll index 5557839e715ed..fd039e75444b5 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll index a987fac31b743..fe8b8a3a022bc 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This code used to fail the SCEV-based code generation because the scop ; contains an AddRecExpr of an outer loop.
When generating code, we did not diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll index 96c6d900a7a00..d1f48d92e0e75 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; AST: #pragma simd ; AST: #pragma omp parallel for diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll index c4ad665c7b6cf..5b032801c7282 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; The interesting part of this test case is the instruction: ; %tmp = bitcast i8* %call to i64** diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll index 82acba8b3c523..d612faf7b67c5 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR ; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction. 
; diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll index aa44658131bba..213cc2635fb6d 100644 --- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll +++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR ; ; float A[100]; ; diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll index 4deab1af0ccf0..fef23f141eaeb 100644 --- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll +++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly<no-default-opts;delicm>' -S < %s | FileCheck %s ; ; Verify that -polly-parallel can handle mapped scalar MemoryAccesses. ; diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll index 43326b29f7ef1..fd8ce87b45ae8 100644 --- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll +++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s -; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck --check-prefix=CODEGEN %s ; REQUIRES: asserts ; Parallelization of detected matrix-multiplication. 
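Where the old spelling chained Polly passes with commas inside one -passes string, the new polly-custom pipeline takes its phases as a semicolon-separated list in source order, and print phases additionally need their -polly-print-* flag, as in the matmul-parallel.ll hunk above. An illustrative old/new pair under these conventions:

; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s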
diff --git a/polly/test/CodeGen/OpenMP/new_multidim_access.ll b/polly/test/CodeGen/OpenMP/new_multidim_access.ll index 5faabb4d20c1a..8018acdcb0e6a 100644 --- a/polly/test/CodeGen/OpenMP/new_multidim_access.ll +++ b/polly/test/CodeGen/OpenMP/new_multidim_access.ll @@ -1,10 +1,6 @@ -; RUN: opt %loadPolly -polly-print-import-jscop \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-import-jscop \ -; RUN: -polly-codegen -S < %s \ -; RUN: -polly-parallel \ -; RUN: | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S -polly-parallel < %s | FileCheck %s -check-prefix=IR ; void new_multidim_access(long n, long m, float A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll index b7b3a44610f32..99069612cd1d4 100644 --- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll +++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we pass %rem96 to the parallel subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll index c207f589e4da0..236362a3e23dc 100644 --- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll +++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll @@ -1,17 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR - -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR - -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR + +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-scheduling=runtime -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR + +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR ; IR: @GOMP_parallel_loop_runtime_start diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll index dbfbd9a905086..9925187883173 100644 --- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll +++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | 
FileCheck %s -check-prefix=IR ; IR: @foo_polly_subfn target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll index ee43b8aa34a44..3738266b558ed 100644 --- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll +++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; - Test the case where scalar evolution references a loop that is outside diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll index 7a8cd77bb1571..fb420b06b9afb 100644 --- a/polly/test/CodeGen/OpenMP/reference_latest.ll +++ b/polly/test/CodeGen/OpenMP/reference_latest.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;delicm;simplify>' -polly-parallel -S < %s | FileCheck %s ; ; Test that parallel codegen handles scalars mapped to other arrays. ; After mapping "store double %add10" references the array "MemRef2". diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll index 9b79f29094482..861a78e4acd7a 100644 --- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll +++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s +; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: define internal void @DoStringSort_polly_subfn target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnueabi" diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll index e5aee840ade74..5e8a58fadd56c 100644 --- a/polly/test/CodeGen/OpenMP/single_loop.ll +++ b/polly/test/CodeGen/OpenMP/single_loop.ll @@ -1,14 +1,14 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4 -; RUN: 
opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4 ; This extensive test case tests the creation of the full set of OpenMP calls ; as well as the subfunction creation using a trivial loop as example. 
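The single_loop.ll update above illustrates how feature variants are expressed after the migration: one pipeline spelling, with the OpenMP backend and the scheduling policy selected purely by flags. A reduced sketch of two such variants (the check prefixes here are hypothetical):

; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S < %s | FileCheck %s -check-prefix=STATIC
; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S < %s | FileCheck %s -check-prefix=DYNAMIC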
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll index c519bfdee7a58..95324793f4fa4 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; #define N 1024 ; float A[N]; diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll index f6dfd62d6bcc1..7334762f84f6c 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll @@ -1,18 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -polly-scheduling=static \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-STATIC-IR ; Ensure the scalars are initialized before the OpenMP code is launched. 
; diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll index 934e04461f134..77c1b23a3f76c 100644 --- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll +++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This test case verifies that we create correct code even if two OpenMP loops ; share common outer variables. diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll index 3e0c9d67d5ca8..39bdac793e8a1 100644 --- a/polly/test/CodeGen/PHIInExit.ll +++ b/polly/test/CodeGen/PHIInExit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" %struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t } diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll index ccb0d15cfc3d2..9ec9804d35b0d 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-codegen-add-debug-printing \ -; RUN: -polly-ignore-aliasing < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-codegen-add-debug-printing -polly-ignore-aliasing < %s | FileCheck %s ; #define N 10 ; void foo(float A[restrict], double B[restrict], char C[restrict], diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll index 4ffb7fd6e4621..736c136eeb67c 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; define void @func(i32 %n, ptr %A) { diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll index 0440bda74b391..bab2690bddb17 100644 --- a/polly/test/CodeGen/alias-check-multi-dim.ll +++ b/polly/test/CodeGen/alias-check-multi-dim.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: sext i32 %indvar.init to i64 diff --git 
a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll index 4186b8521a535..37ec2d5b748af 100644 --- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll +++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -S < %s | FileCheck %s ; ; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[], ; float A6[], float A7[], float A8[], float A9[]) { diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll index 8e1fc3b328355..7fed270cb51dd 100644 --- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll +++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; We have to cast %B to "short *" before we create RTCs. ; diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll index e601c22b978da..5326af339ddac 100644 --- a/polly/test/CodeGen/aliasing_different_pointer_types.ll +++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check that we cast the different pointer types correctly before we compare ; them in the RTC's. We use i8* as max pointer type. diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll index e1dae03280a0e..5d0b40d6b59aa 100644 --- a/polly/test/CodeGen/aliasing_multidimensional_access.ll +++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we calculate the maximal access into array A correctly and track the overflow state. 
; diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll index a79ba2532535d..1b7b85835d795 100644 --- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll +++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll index efe4af1c9e7c5..fa8053ccabbea 100644 --- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll +++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll index 3079e58d7daba..4e8570944f6c6 100644 --- a/polly/test/CodeGen/aliasing_struct_element.ll +++ b/polly/test/CodeGen/aliasing_struct_element.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; We should only access (or compute the address of) "the first element" of %S ; as it is a single struct not a struct array. The maximal access to S, thus diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll index e0f6a959476f6..daf7999c8072b 100644 --- a/polly/test/CodeGen/alignment.ll +++ b/polly/test/CodeGen/alignment.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check that the special alignment information is kept ; diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll index ada03e0663722..7d2d9038270a9 100644 --- a/polly/test/CodeGen/annotated_alias_scopes.ll +++ b/polly/test/CodeGen/annotated_alias_scopes.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=SCOPES ; ; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way. ; diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll index 99f2eae9dd8e5..461af09b5b289 100644 --- a/polly/test/CodeGen/blas_sscal_simplified.ll +++ b/polly/test/CodeGen/blas_sscal_simplified.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Regression test for a bug in the runtime check generation. 
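Several of the tests in this patch pipe nothing into FileCheck: they only assert that opt exits cleanly, which lit treats as the pass/fail criterion. The inverse pattern, for inputs that are expected to abort, wraps the invocation in not --crash, as bad_alignment.ll earlier in this patch does. Both shapes, sketched for illustration:

; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s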
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll index 5dba93373b70b..5eb6076892f3e 100644 --- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll +++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -disable-output < %s ; ; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a ; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll index 905aa52df5080..9d3c5a811b16a 100644 --- a/polly/test/CodeGen/constant_condition.ll +++ b/polly/test/CodeGen/constant_condition.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s ;#include <string.h> ;int A[1]; diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll index b8c9a81b71a91..d4df48b757d3d 100644 --- a/polly/test/CodeGen/create-conditional-scop.ll +++ b/polly/test/CodeGen/create-conditional-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -verify-loop-info -S < %s | FileCheck %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll index dfef4202391d4..31b5e69ae4c6a 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Check we do not crash even though the dead %tmp8 is referenced by a parameter ; and we do not pre-load it (as it is dead). diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll index fcc6764ce9c21..88b844bea5e4e 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Check we do not crash even though there is a dead load that is referenced by ; a parameter and we do not pre-load it (as it is dead). 
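In the hunks that follow (debug-intrinsics.ll, exprModDiv.ll, hoisted_load_escapes_through_phi.ll), RUN invocations that were spread over backslash continuations are folded into single physical lines while being migrated. lit concatenates continued '; RUN:' lines before executing them, so the two spellings are equivalent; the continued form and its single-line replacement are sketched here:

; RUN: opt %loadNPMPolly \
; RUN:   -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | \
; RUN:   FileCheck %s
; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s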
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index ed4b81a8e3a3c..f397a4b83d88a 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,10 +1,6 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
-; RUN: FileCheck %s
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
-; RUN: FileCheck %s
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index edc03333a358d..7f6f128c2cff2 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
 ;
 ; This caused dominance problems at some point as we do bail out during
 ; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index a2fe805f402e0..f6c39eb0517bc 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;opt-isl;codegen>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR35362
 ; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index f2c9da04d6495..09570938a9ca1 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 2f1ec1a7872aa..7d8ef7acf9435 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node that is used in a different of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index 63b6becd19574..c5c11c8ea2f8f 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index abec28894f45b..1e38210c733d9 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index c9b419abe3242..b123e90c07882 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,8 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -polly-import-jscop-postfix=pow2 \
-; RUN: -S < %s | FileCheck %s -check-prefix=POW2
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=pow2 -S < %s | FileCheck %s -check-prefix=POW2
 ;
 ; void exprModDiv(float *A, float *B, float *C, long N, long p) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index 1ca2413fd5e19..c7873baeeaeb7 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=false < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; Check that we generate valid code even if the load of cont_STACKPOINTER is
 ; hoisted in one SCoP and used (through the phi node %tmp2).
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index aa29bfd7dbcbc..31ae969cd3156 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index 1b913f2cb07be..eb6f7ae5ff6d1 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index d210105c46baf..f7595a6afb0be 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s
 ;
 ; Excerpt from the test-suite's oggenc reduced using bugpoint.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 33233fe2fdf17..247c102834b25 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; The SCEV expression in this test case refers to a sequence of sdiv
 ; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index a8c626347efe9..fc1cce41c0f4e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; This test case has a inner SCEV sdiv that will escape the SCoP. Just check we
 ; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 31c14e85f253e..1ff598a4a021a 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
 ; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index b42371b0891e6..4cd146ddbf62e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; Check that this will not crash our code generation.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index 45af63402c986..586875bbefcbe 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This will just check that we generate valid code here.
 ;
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index a708548798ebb..0f35664eb7e1c 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that we remove the lifetime markers from everywhere.
 ;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index a643b8accd4e9..4a64c1a641182 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that we remove the misc intrinsics from the optimized SCoP.
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index e7cbf748bea73..15fe0d9e22416 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index 24e9240c234d1..c1ab026e97701 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index d1d861e316ee4..f0c833ce1bce1 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 21e53055c56b0..13576b9f40455 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-process-unprofitable -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODEGEN
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
 
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index 1fd9cb81771c6..d92d97012b33c 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check that we generate valid code as we did non preload the base pointer
 ; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index 420cb608f9ba4..f6dcac08dffca 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; The offset of the %tmp1 load wrt. to %buff (62 bytes) is not divisible
 ; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index 2d5e6042ea6a4..c89da73efc839 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 3d1958e5b8a43..7d5139cc55f88 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 252463384a5c8..2a704ee9c576a 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This test case checks whether Polly generates alias metadata in case of
 ; the ublas gemm kernel and polly-invariant-load-hoisting.
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index d4ac433475f05..f6b873994036c 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 06a9a93363ed9..4dbcc3b3b049d 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 66ab9a31b1032..39520c8fd8217 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index fa904e9b96d34..414ca127a251f 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
-; RUN: -polly-invariant-load-hoisting \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting < %s | FileCheck %s
 
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 0
 ; CHECK: %polly.access.A.load = load ptr, ptr %polly.access.A
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index 36e588329d669..f0782c023378b 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 0a88bb70966d2..034c3587a0708 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
-; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S -polly-allow-differing-element-types < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index 416148b72303b..85578d3ba0992 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; int f(int *A, int *B) {
 ; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index 906bfc1805d39..ff6e9a8e3ddae 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; void fence(void);
 ;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index ab02e639f0d2a..edd38cab2afba 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This crashed at some point as the invariant load is in a non-affine
 ; subregion. Just check it does not anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index 1db27ad8e58ba..923102440c547 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK: polly.start
 ;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 5a11adcdebbc5..0e381b863fb8b 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Check that this does not crash as the invariant load is not executed (thus
 ; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index 7e0550fb3be94..bbbe1f1663964 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 
 ; CHECK: polly.start
 
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index abf957b556daa..9fe343f752d14 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; SCOP: Assumed Context:
 ; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index b565f1bd5096a..dc1c2bca4b6e3 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index ba2999e27984d..bb60c50b1ab40 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 26c964c9c6a72..87c407e05b972 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
 ; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 6bf11d5697bd7..5e2b28c53019e 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 07ce941522459..20d9f6d40b7d6 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index 19b30afd33ba7..51f8a55d1a400 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
-; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting -polly-ignore-parameter-bounds -S < %s | FileCheck %s
 
 ; CHECK: polly.preload.begin:
 ; CHECK-NEXT: %global.load = load i32, ptr @global, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 1dcc175ebb163..432c155fdd3ae 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This crashed at some point as the pointer returned by the call
 ; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 43b3d99e11a2f..65ba2cd993193 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; Check we generate valid code.
 
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index 34c4e398e2ac0..5e225d73bdcd3 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ; https://github.com/llvm/llvm-project/issues/56692
 ;
 ; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index b228baf9bdf22..4d55273618df6 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index ccd4198b9fe85..33b3d3b72225f 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; A load must provide a value for every statement instance.
 ; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index f43247b3e5057..dc0c5517d7ca5 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index 49e312404cca8..cf28a4de63f3b 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index 8ae38eeeb4982..1d8a8132a79cb 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 
 ; Verify that we actually detect this loop as the innermost loop even though
 ; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index 64019a6090212..c222f67ed7836 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 5dcb51dcb91cd..32256a7344664 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
 
 
 ;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 26fe4eb82ae49..5d7f67f1f9060 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Test case to trigger the hard way of creating a unique entering
 ; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index 501aa8fbea4d6..c3ffe4abcddd6 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that @llvm.memcpy does not get a !alias.scope annotation.
 ; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index f63eb18118e77..b084672971855 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 ; CHECK: polly
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index 63e43c83ada5f..66a4fdf42bc8e 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index 86b17573caada..d3f8b718889e4 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index 93e34e2fd0fc1..e85d7c9e7785d 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index a63f8a615ff9e..cb12700bfb561 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>,polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=function(polly<no-default-opts;opt-isl>),function(polly<no-default-opts>)' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR34441
 ; Properly handle multiple -polly-scops/-polly-codegen in the same
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index effae223c152a..b92359782d999 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 
 ; This test case has two scops in a row. When code generating the first scop,
 ; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 101fcaff0c82e..96615079be365 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-allow-differing-element-types < %s | FileCheck %s
 
 ; CHECK: polly
 
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index 930041eaddaad..ca89cb53e09b7 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 
 ; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
 ; CHECK: %polly.access.global.load.load = load i32, ptr %polly.access.global.load
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 1e06a7e186bb0..8198108b22059 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 
 ; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
 ; phi nodes in polly.merge_new_and_old, one illegaly using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index d5ad9a7aef239..f915b5a0772e6 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index a022083f43a9e..604c5ac54bcdb 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
 ;
 ; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
 ;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index 6015516a3bc49..ebb02a90ffb5d 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index 0d0f634ed7c16..ff9f504295672 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index bfa3c156ea75d..2ad1e75216362 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index b9386333a79b4..386fe5f9f207f 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 6460c427270f4..5e5f34d99bde3 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index 1b6802f1a4c35..db9f0d518041b 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.wombat = type {[4 x i32]}
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 007a4c586aa32..096eb8609e1bb 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same operand
 ; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 20edbf2bd6c03..2810a8ab5361f 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same BBMap was
 ; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index 0ff39d3fe882d..cdb2000d90d6b 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25438
 ; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index 7df3d8976ea80..b4889c76079cc 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -S < %s | FileCheck %s
 
 ; This test verifies that values defined in another scop statement and used by
 ; PHI-nodes in non-affine regions are code generated correctly.
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 179062dd62d0a..45465c627f55a 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
 ;
 ; Check that we do not reuse the B[i-1] GEP created in block S again in
 ; block Q. Hence, we create two GEPs for B[i-1]:
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 427e7e2461f1d..90d5efdc3a9f5 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index 292c0f2b53941..5bb4fd19f4fd1 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25412
 ; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index 03f091a405017..582607787eb7d 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
 ;
 ; void non-affine-update(double A[], double C[], double B[]) {
 ; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index 153cdb7ed9f6c..eaf74d9c63e0e 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index a359b662e6579..9709e231a4e86 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-branches -S -verify-dom-info < %s | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index 445cef0d6f697..be7a8a23df869 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; CHECK-NOT: polly.start
 ;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index 4c33be3407251..730c57299d569 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
 
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
 
 ; Ensure that the provided thread numbers are forwarded to the OpenMP calls.
 ;
diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
index dd0a24b14a3b8..8d5f74751af49 100644
--- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll
+++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll
index 8eba6444abb16..3ae95020d52dd 100644
--- a/polly/test/CodeGen/param_div_div_div_2.ll
+++ b/polly/test/CodeGen/param_div_div_div_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; Check that we guard the divisions because we moved them and thereby increased
 ; their domain.
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll index fad4b21cf3dc8..fe5fd8cffece7 100644 --- a/polly/test/CodeGen/partial_write_array.ll +++ b/polly/test/CodeGen/partial_write_array.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of an array access. ; diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll index 67828808e2fac..d0e5615e4220d 100644 --- a/polly/test/CodeGen/partial_write_emptyset.ll +++ b/polly/test/CodeGen/partial_write_emptyset.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write, where "partial" is the empty set. ; The store is never executed in this case and we do generate it in the diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll index b26bd81b5663b..a36414297485a 100644 --- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll +++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75 ; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3 diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll index 7577b137a2750..e0069ebc8eae8 100644 --- a/polly/test/CodeGen/partial_write_impossible_restriction.ll +++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; The isl scheduler isolates %cond.false into two instances. 
; A partial write access in one of the instances was never executed, diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll index 7c138c82091e5..e7f4225cf9310 100644 --- a/polly/test/CodeGen/partial_write_in_region.ll +++ b/polly/test/CodeGen/partial_write_in_region.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: -verify-dom-info \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -S < %s | FileCheck %s ; ; void foo(long A[], float B[], float C[]) { ; for (long i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll index ba15a7871f431..85b56fefad809 100644 --- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll +++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: -verify-dom-info -polly-allow-nonaffine-loops \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -polly-allow-nonaffine-loops -S < %s | FileCheck %s ; This test verifies that partial writes within non-affine loops are code ; generated correctly. diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll index b8c413885cdb0..bb99d4ea086d2 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar. ; diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll index 8c1953a05ad3c..37a9d98c6a22e 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar in a non-affine subregion. 
; diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll index 4b91e5055c0b1..61f122228c377 100644 --- a/polly/test/CodeGen/perf_monitoring.ll +++ b/polly/test/CodeGen/perf_monitoring.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll index d5c33d64f3418..4c47a12c12904 100644 --- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll index ab99c4d2de062..6d09d8bf27ebe 100644 --- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll index 447a14e9999c2..2ccd7965bbeaf 100644 --- a/polly/test/CodeGen/phi-defined-before-scop.ll +++ b/polly/test/CodeGen/phi-defined-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK-LABEL: polly.merge_new_and_old: ; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ] diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll index e096aa2f4f8c0..1655104b08390 100644 --- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll +++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Make sure code generation does not break in case an 'error block' is detected ; outside of the scope. In this situation, we should not affect code generation. 
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll index 9d73d8a792558..1cadac0a5cf73 100644 --- a/polly/test/CodeGen/phi_condition_modeling_1.ll +++ b/polly/test/CodeGen/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll index 2d1364842d735..8f2e2a517c96c 100644 --- a/polly/test/CodeGen/phi_condition_modeling_2.ll +++ b/polly/test/CodeGen/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll index 25bcf2a118ef4..5f0f8de19f223 100644 --- a/polly/test/CodeGen/phi_conditional_simple_1.ll +++ b/polly/test/CodeGen/phi_conditional_simple_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll index 43d29b9ec8649..703e55f15c084 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through. ; diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll index 9f28024fcfa0a..3d911e0d6a87f 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll index 73e99ac0f32c5..5f81f52078723 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. 
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll index 6c9bd56a98722..abb86e650ce2a 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll index 4cb392d3353d3..47a8a8190c8d9 100644 --- a/polly/test/CodeGen/phi_loop_carried_float.ll +++ b/polly/test/CodeGen/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll index 9fd8ad413128a..81dd5cecd1878 100644 --- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll +++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly -S \ -; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' < %s | FileCheck %s -; RUN: opt %loadNPMPolly -S \ -; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll index 80a1c41b83ac0..6331c24da31b0 100644 --- a/polly/test/CodeGen/phi_scalar_simple_1.ll +++ b/polly/test/CodeGen/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N) { ; for (int i = 1; i < N; i++) diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll index 614c8acfb9f8e..0adadf6b90159 100644 --- a/polly/test/CodeGen/phi_scalar_simple_2.ll +++ b/polly/test/CodeGen/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll index 7e21666f1db00..4d6ede638c8f2 100644 --- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll +++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: polly.merge_new_and_old: ; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ] diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll 
index 36a8684dbc37a..4de24fb058c26 100644 --- a/polly/test/CodeGen/phi_with_one_exit_edge.ll +++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; ; CHECK: polly.merge_new_and_old: diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll index 918e4c6c9c0b0..706b01d7f8ca5 100644 --- a/polly/test/CodeGen/pointer-type-expressions-2.ll +++ b/polly/test/CodeGen/pointer-type-expressions-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(ptr %start, ptr %end) { diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll index e7feebc163d4b..2478e2238fd0e 100644 --- a/polly/test/CodeGen/pointer-type-expressions.ll +++ b/polly/test/CodeGen/pointer-type-expressions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll index 9ee050a1e5070..cac6f4fdd16f1 100644 --- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll +++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; void f(int a[], int N, float *P, float *Q) { diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll index b8202318a3eca..ca5d866ae6cce 100644 --- a/polly/test/CodeGen/pointer_rem.ll +++ b/polly/test/CodeGen/pointer_rem.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<ast>' -polly-print-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<codegen>' -polly-print-scops -S < %s | FileCheck %s --check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128" target triple 
= "aarch64--linux-gnu" diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll index 7547b0bbed749..94be6d7824921 100644 --- a/polly/test/CodeGen/pr25241.ll +++ b/polly/test/CodeGen/pr25241.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241) ; Ensure that synthesized values of a PHI node argument are generated in the diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll index a551d810c0802..49a8c38309eb2 100644 --- a/polly/test/CodeGen/ptrtoint_as_parameter.ll +++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: if.then260: ; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1 diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll index 365cbbce495fb..2ae0f9e797bd1 100644 --- a/polly/test/CodeGen/read-only-scalars.ll +++ b/polly/test/CodeGen/read-only-scalars.ll @@ -1,9 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \ -; RUN: \ -; RUN: -S < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \ -; RUN: \ -; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SCALAR ; CHECK-NOT: alloca diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll index 8c5f70770a1c5..21d8c0f98b702 100644 --- a/polly/test/CodeGen/reduction.ll +++ b/polly/test/CodeGen/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | not FileCheck %s ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index 060a1866870e4..f9576826b4f77 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll index 0fe1085dbbacd..53cbdf407c954 100644 --- a/polly/test/CodeGen/reduction_simple_binary.ll +++ b/polly/test/CodeGen/reduction_simple_binary.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: pragma simd reduction ; diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll index 58c27091a22c3..9d5ba4c4ff9fb 100644 --- 
a/polly/test/CodeGen/reggen_domtree_crash.ll +++ b/polly/test/CodeGen/reggen_domtree_crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s ; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll index e5f7d0f9ef5d6..f061ac061e226 100644 --- a/polly/test/CodeGen/region-with-instructions.ll +++ b/polly/test/CodeGen/region-with-instructions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.bb48: ; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3 diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll index 06e0d9df3d951..16b265c064790 100644 --- a/polly/test/CodeGen/region_exiting-domtree.ll +++ b/polly/test/CodeGen/region_exiting-domtree.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s ; Verify that the DominatorTree is preserved correctly for the inserted ; %polly.stmt.exit.exit block, which serves as new exit block for the generated diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll index 39e04dbf93ac7..9d21d16c9f9cd 100644 --- a/polly/test/CodeGen/region_multiexit_partialwrite.ll +++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; This test case has a partial write of PHI in a region-statement. It ; requires that the new PHINode from the region's exiting block is diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll index 4afaab5bbad0a..7984b7ce80209 100644 --- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll +++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; TODO: FIXME: Simplify the context.
; AST: if (n >= 1 && 0 == n <= -1) diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll index 914b76f5e0be7..44d2a4f15b378 100644 --- a/polly/test/CodeGen/run-time-condition.ll +++ b/polly/test/CodeGen/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll index 77306c1046133..102ef04128133 100644 --- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll +++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Test the code generation in the presence of a scalar out-of-scop value being ; used from within the SCoP. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 0c1164b245a43..1988f77086c8a 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node ; alloca is directly computed and not incorrectly transferred through memory. diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll index 375f097283b07..0179072391a33 100644 --- a/polly/test/CodeGen/scalar_codegen_crash.ll +++ b/polly/test/CodeGen/scalar_codegen_crash.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; This test case used to crash the scalar code generation. Check that we ; can generate code for it. diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll index e0941690ae489..09fcfe3e4a09c 100644 --- a/polly/test/CodeGen/scev-backedgetaken.ll +++ b/polly/test/CodeGen/scev-backedgetaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; llvm.org/PR48422 ; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated. diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll index 70f090eae07b3..5942ecbe7cee9 100644 --- a/polly/test/CodeGen/scev-division-invariant-load.ll +++ b/polly/test/CodeGen/scev-division-invariant-load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s ; ; Check that we generate valid code as we did not use the preloaded ; value of %tmp1 for the access function of the preloaded %tmp4.
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll index e2b5afda1bfff..a09d8c5504b1b 100644 --- a/polly/test/CodeGen/scev.ll +++ b/polly/test/CodeGen/scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @f () inlinehint align 2 { diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll index 0adb0ba7eea81..095c362024a83 100644 --- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll +++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the ; test-suite. diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll index 142e83f820fe7..81f4b96d22a37 100644 --- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll +++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Scalar write of bitcasted value. Instead of writing %b of type ; %structty, the SCEV expression looks through the bitcast such that diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll index fd73132258ddc..1cba7567a5e43 100644 --- a/polly/test/CodeGen/scop_expander_insert_point.ll +++ b/polly/test/CodeGen/scop_expander_insert_point.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; CHECK: entry: ; CHECK-NEXT: %outvalue.141.phiops = alloca i64 diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll index d94a1fdfb2c12..56d37a0175853 100644 --- a/polly/test/CodeGen/scop_expander_segfault.ll +++ b/polly/test/CodeGen/scop_expander_segfault.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S %s | FileCheck %s ; ; This test was extracted from gcc in SPEC2006 and it crashed our code ; generation, or to be more precise, the ScopExpander due to an endless diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll index 9f968e5657c90..cdcfe838fa915 100644 --- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll +++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Verify that we generate the runtime check code after the conditional branch ; in the SCoP region entering block (here %entry).
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll index 85be37755c474..144c05b5effba 100644 --- a/polly/test/CodeGen/select-base-pointer.ll +++ b/polly/test/CodeGen/select-base-pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s ; ; Check that we do not crash here. ; diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll index 33a3ee9fbbd47..eeb3048007859 100644 --- a/polly/test/CodeGen/sequential_loops.ll +++ b/polly/test/CodeGen/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll index a7e36bc4c7330..1b3518bdb0cba 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll index 22e9da09ef857..3af9913e6aa04 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll index c33a77ae07939..8800dc7214b06 100644 --- a/polly/test/CodeGen/simple_non_single_entry.ll +++ b/polly/test/CodeGen/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll index bc62047a80a34..5b1cd1991cd73 100644 --- a/polly/test/CodeGen/simple_nonaffine_loop.ll +++ b/polly/test/CodeGen/simple_nonaffine_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s ;#include <stdio.h> ;#include <stdlib.h> diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll index a65e3a25f035a..f0142f726efa4 100644 --- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 
| FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll index acccb48f18a3c..cc5e7b221026c 100644 --- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;define N 20 diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll index 7a67f6ba96ce2..1299362369478 100644 --- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll index 2d939167b71ee..d025ef2116a40 100644 --- a/polly/test/CodeGen/single_do_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#define N 20 diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll index 83c9e9d0324ce..b473e266343a3 100644 --- a/polly/test/CodeGen/single_do_loop_scev_replace.ll +++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll index 2db34663e93ce..c04738e6843a0 100644 --- a/polly/test/CodeGen/single_loop.ll +++ b/polly/test/CodeGen/single_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll index f83e8823c63df..82ec7ffd85462 100644 --- a/polly/test/CodeGen/single_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll index 1427189d74a7d..8affb71fad649 100644 --- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include "limits.h" ;#define N 20 diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll index 1a70d4a879d83..307b8358ff980 100644 --- a/polly/test/CodeGen/single_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ; diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll index 44ce1236e9f84..1d78c7a7329d4 100644 --- a/polly/test/CodeGen/single_loop_param.ll +++ b/polly/test/CodeGen/single_loop_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll index fda9bfab11b8f..5fad1d43ae0d7 100644 --- a/polly/test/CodeGen/single_loop_param_less_equal.ll +++ b/polly/test/CodeGen/single_loop_param_less_equal.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll index b888c860eacd0..75a8cb2094a16 100644 --- a/polly/test/CodeGen/single_loop_param_less_than.ll +++ b/polly/test/CodeGen/single_loop_param_less_than.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll 
b/polly/test/CodeGen/single_loop_zero_iterations.ll index b1ce491b5c8a2..3194dba52190b 100644 --- a/polly/test/CodeGen/single_loop_zero_iterations.ll +++ b/polly/test/CodeGen/single_loop_zero_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty ;#define N 20 ; diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll index f4b17e687ada6..73d6006a6b621 100644 --- a/polly/test/CodeGen/split_edge_of_exit.ll +++ b/polly/test/CodeGen/split_edge_of_exit.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -disable-output < %s ; ; This is a scop directly preceded by a region, i.e. the scop's entry is the ; region's exit block. This test is to ensure that the RegionInfo is correctly diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll index b921202285bb2..03363f49ce800 100644 --- a/polly/test/CodeGen/split_edges.ll +++ b/polly/test/CodeGen/split_edges.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1536 x float] zeroinitializer diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll index 8f4d48f5dcb00..59df1618cfd71 100644 --- a/polly/test/CodeGen/split_edges_2.ll +++ b/polly/test/CodeGen/split_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll index a13a1b6ab98f2..177d86adb9066 100644 --- a/polly/test/CodeGen/srem-in-other-bb.ll +++ b/polly/test/CodeGen/srem-in-other-bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void pos(float *A, long n) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll index b49c4e12fe11a..5a490b68b9a9f 100644 --- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll +++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \ -; RUN: -polly-invariant-load-hoisting=true | FileCheck %s +; RUN: opt %loadNPMPolly -verify-dom-info 
'-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; This caused an infinite recursion during invariant load hoisting at some ; point. Check it does not and we add a "false" runtime check. diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll index bb878cc342af8..d41e4a87bfb65 100644 --- a/polly/test/CodeGen/stmt_split_no_dependence.ll +++ b/polly/test/CodeGen/stmt_split_no_dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !3, !noalias !6 ; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !6, !noalias !3 diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll index 1a9e7081bebdc..6696efca63f02 100644 --- a/polly/test/CodeGen/switch-in-non-affine-region.ll +++ b/polly/test/CodeGen/switch-in-non-affine-region.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll index b2a062363eef4..86395f25db1a8 100644 --- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll +++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check for the correct written value of a scalar phi write whose value is ; defined within the loop, but its effective value is its last definition when diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll index 5668063c27c8e..b5172badd76dc 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -verify-loop-info < %s | FileCheck %s ; ; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167 ; diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll index fdc98fbb4d9e7..39cadc78f7e36 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we do not crash as described here: http://llvm.org/PR21167 ; diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll index aad998ba2728b..7c28ca4860e79 100644 --- a/polly/test/CodeGen/test.ll +++ b/polly/test/CodeGen/test.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;int bar1(); diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll 
b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll index 1c68389eaeba8..d97a632fc382e 100644 --- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll +++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.merge_new_and_old: ; CHECK-NEXT: merge = phi diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll index 4396c38310dce..845d106d43b0e 100644 --- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll +++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK-LABEL: for.cond: ; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ] diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll index dd3f310ef1502..4b9d49cb02ec6 100644 --- a/polly/test/CodeGen/two-scops-in-row.ll +++ b/polly/test/CodeGen/two-scops-in-row.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -disable-output < %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; SCALAR: if ( diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll index 354e3cd180107..2a3ba8ae45757 100644 --- a/polly/test/CodeGen/udiv_expansion_position.ll +++ b/polly/test/CodeGen/udiv_expansion_position.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Verify we do not crash when we synthesize code for the udiv in the SCoP. ; diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll index e08af07e604e8..ad0e6ca7e350b 100644 --- a/polly/test/CodeGen/uninitialized_scalar_memory.ll +++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Verify we initialize the scalar locations reserved for the incoming phi ; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll index 46706804a81b0..e7f4d601edab5 100644 --- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll +++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s ; The loop for.body is a scop with invariant load hoisting, but does not ; terminate predictably for ScalarEvolution. The scalar %1 therefore is not diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll index 6f2d3dc582db3..d1f4450d086e0 100644 --- a/polly/test/CodeGen/variant_load_empty_domain.ll +++ b/polly/test/CodeGen/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; ; void f(int *A) { diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll index b342b1cb5aa27..44f6dbcd34d1d 100644 --- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll +++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.start ; int /* pure */ g() diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll index 0c19eb6aa605a..de340ef48d16e 100644 --- a/polly/test/DeLICM/confused_order.ll +++ b/polly/test/DeLICM/confused_order.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS ; ; ForwardOptree changes the SCoP and may already map some accesses. 
; DeLICM must be prepared to encounter implicit reads diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll index 66d9ae889e657..ba42692febab2 100644 --- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll +++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; The domain of bb14 contradicts the SCoP's assumptions. This leads to ; 'anything goes' inside the statement since it is never executed, diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll index a78a4691bb0d5..19cc334f70054 100644 --- a/polly/test/DeLICM/load-in-cond-inf-loop.ll +++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; When %b is 0, %for.body13 is an infinite loop. In this case the loaded ; value %1 is not used anywhere. diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll index 9a8e5989fdad1..cc4e0ab387d2a 100644 --- a/polly/test/DeLICM/map_memset_zero.ll +++ b/polly/test/DeLICM/map_memset_zero.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; Check that PHI mapping works even in the presence of a memset whose ; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll index da5f4ec24a47e..9e49300381b57 100644 --- a/polly/test/DeLICM/nomap_alreadymapped.ll +++ b/polly/test/DeLICM/nomap_alreadymapped.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll index 60955368fe59c..6460dbdb808fb 100644 --- a/polly/test/DeLICM/nomap_escaping.ll +++ b/polly/test/DeLICM/nomap_escaping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll index 9ba8ce2641231..72eea57b8fdf5 100644 --- a/polly/test/DeLICM/nomap_occupied.ll +++ b/polly/test/DeLICM/nomap_occupied.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll index 7a185d336bad3..67bac06f1505f 100644 --- a/polly/test/DeLICM/nomap_readonly.ll +++ b/polly/test/DeLICM/nomap_readonly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; fsomeval = 21.0 + 21.0; diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll index 0ed7f6ee8e239..f3fcb0ccd06e4 100644 --- a/polly/test/DeLICM/nomap_spuriouswrite.ll +++ b/polly/test/DeLICM/nomap_spuriouswrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll index bf851ac342d20..0f2943a5b1417 100644 --- a/polly/test/DeLICM/nomap_storagesize.ll +++ b/polly/test/DeLICM/nomap_storagesize.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(float *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll index 9fcd52aad743c..fc8459a34972c 100644 --- a/polly/test/DeLICM/nomap_writewrite.ll +++ b/polly/test/DeLICM/nomap_writewrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output 
< %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll index 1f7527c841208..d48665bdc29c1 100644 --- a/polly/test/DeLICM/outofquota-reverseDomain.ll +++ b/polly/test/DeLICM/outofquota-reverseDomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; This causes an assertion to fail on out-of-quota after 1000000 operations. ; (The error was specific to -polly-delicm-max-ops=1000000 and changes diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll index 64302d9983261..d784655db60f3 100644 --- a/polly/test/DeLICM/pass_existence.ll +++ b/polly/test/DeLICM/pass_existence.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Simple test for the existence of the DeLICM pass. ; diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll index 2a92503809a24..82799e4fd1ab8 100644 --- a/polly/test/DeLICM/pr41656.ll +++ b/polly/test/DeLICM/pr41656.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR41656 ; diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll index deba8bfcc5daf..10f8b64c3dd2f 100644 --- a/polly/test/DeLICM/pr48783.ll +++ b/polly/test/DeLICM/pr48783.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR48783 ; diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll index 29b7a3617300b..5d6531f51d570 100644 --- a/polly/test/DeLICM/reduction.ll +++ b/polly/test/DeLICM/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_constant_selfconflict.ll b/polly/test/DeLICM/reduction_constant_selfconflict.ll index 012e0a0794b2b..223a429d76343 100644 --- a/polly/test/DeLICM/reduction_constant_selfconflict.ll +++ b/polly/test/DeLICM/reduction_constant_selfconflict.ll @@ -1,4 +1,4 @@ -; 
RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate.ll b/polly/test/DeLICM/reduction_looprotate.ll index 341cc091f7e18..b8eefe5e57cf8 100644 --- a/polly/test/DeLICM/reduction_looprotate.ll +++ b/polly/test/DeLICM/reduction_looprotate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll index a58eabb4fbd82..627a4452c3f90 100644 --- a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll +++ b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Verify that delicm can cope with never taken PHI incoming edges. ; The edge %body -> %body_phi is never taken, hence the access MemoryKind::PHI, diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll index 5a81441cf0eea..1d3a789f7ce07 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll index d9c5268e631df..37499cd73020f 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck 
%s -match-full-lines ; ; Load (but not store) of A[j] hoisted, reduction only over some iterations. ; diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll index 6a4223f5af655..79a700ff122e2 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Load (but not store) of A[j] hoisted, reduction not written in all iterations. ; FIXME: %join is not mapped because the MemoryKind::Value mapping does not diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll index bf4b8018d5526..7e82daa9f80fc 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Hoisted reduction load (but not the store) without preheader. ; diff --git a/polly/test/DeLICM/reduction_looprotate_hoisted.ll b/polly/test/DeLICM/reduction_looprotate_hoisted.ll index 795b94912aa42..7dc6e0fa9e408 100644 --- a/polly/test/DeLICM/reduction_looprotate_hoisted.ll +++ b/polly/test/DeLICM/reduction_looprotate_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A, int* StartPtr) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_licm.ll b/polly/test/DeLICM/reduction_looprotate_licm.ll index 935f31abced30..a9c55a8f54087 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_licm2.ll b/polly/test/DeLICM/reduction_looprotate_licm2.ll index 8b06e7466f20a..b98950b71bc85 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm2.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule
-polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Use %phi instead of the normal %add; that is, the last iteration will ; be ignored such that the %phi cannot be written to A[3] in %body. diff --git a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll index 51bb7291a73ed..4424d904b607d 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule \ -; RUN: -polly-delicm-overapproximate-writes=true \ -; RUN: -polly-delicm-compute-known=true -polly-print-delicm \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Make sure delicm works even in the case of two stores that store the same value. ; diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll index 027df44e86193..7d20b8d5c7cbf 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Register-promoted reduction but without preheader. ; diff --git a/polly/test/DeLICM/reduction_looprotate_load.ll b/polly/test/DeLICM/reduction_looprotate_load.ll index 6aa83ae195031..e288a86f30719 100644 --- a/polly/test/DeLICM/reduction_looprotate_load.ll +++ b/polly/test/DeLICM/reduction_looprotate_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A, double* StartPtr) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll index 4ea3fa53a339a..4582f0a36eb5c 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll index 2e7abe444ad65..7df2885e01339 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all. diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll index 60afdeb5fc97e..a1bd5d3f90fe7 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all, such that A[j] is also not written to. diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll index e63b457de92db..8329a85ecf13b 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all, such that A[j] is also not accessed. 
diff --git a/polly/test/DeLICM/reduction_looprotate_readonly.ll b/polly/test/DeLICM/reduction_looprotate_readonly.ll index a9535467b3bde..5227f42ae4824 100644 --- a/polly/test/DeLICM/reduction_looprotate_readonly.ll +++ b/polly/test/DeLICM/reduction_looprotate_readonly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A, double Start) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll index 3d486910c8612..77d823c8ef6d5 100644 --- a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll +++ b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_undef.ll b/polly/test/DeLICM/reduction_looprotate_undef.ll index 8c0544ed77852..f70df6075c2d3 100644 --- a/polly/test/DeLICM/reduction_looprotate_undef.ll +++ b/polly/test/DeLICM/reduction_looprotate_undef.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_overapproximate.ll b/polly/test/DeLICM/reduction_overapproximate.ll index 2d33d3a0ece2a..d6cbb70a84a4a 100644 --- a/polly/test/DeLICM/reduction_overapproximate.ll +++ b/polly/test/DeLICM/reduction_overapproximate.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true 
-polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL ; ; void func(double *A) { ; for (int j = -1; j < 3; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_preheader.ll b/polly/test/DeLICM/reduction_preheader.ll index c6e3643797c04..f3ce58b1bc954 100644 --- a/polly/test/DeLICM/reduction_preheader.ll +++ b/polly/test/DeLICM/reduction_preheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll index 97826f603e5d4..542cec71ab855 100644 --- a/polly/test/DeLICM/reduction_unrelatedunusual.ll +++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; Map %add and %phi to A[j]. ; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll index 4460620852a85..d56b237aa71d9 100644 --- a/polly/test/DeLICM/reject_loadafterstore.ll +++ b/polly/test/DeLICM/reject_loadafterstore.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll index 9bc6bf1f23733..9b7f8e5f97af3 100644 --- a/polly/test/DeLICM/reject_outofquota.ll +++ b/polly/test/DeLICM/reject_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps;delicm>' -polly-print-deps -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll index ddd13dad2ed31..0fea4d7bb3960 100644 --- a/polly/test/DeLICM/reject_storeafterstore.ll +++ b/polly/test/DeLICM/reject_storeafterstore.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll index c987156b51cd1..0b75c16495c5c 100644 --- a/polly/test/DeLICM/reject_storeinsubregion.ll +++ b/polly/test/DeLICM/reject_storeinsubregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll index 342888c6654f4..311a7351c955b 100644 --- a/polly/test/DeLICM/reject_unusualstore.ll +++ b/polly/test/DeLICM/reject_unusualstore.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb 
'-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; ; void func(double *A) { diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll index 0d30791cd94e7..14de2b9d0bf84 100644 --- a/polly/test/DeLICM/skip_maywrite.ll +++ b/polly/test/DeLICM/skip_maywrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll index a7c79f7524630..a213a91343f3d 100644 --- a/polly/test/DeLICM/skip_multiaccess.ll +++ b/polly/test/DeLICM/skip_multiaccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR34485 ; llvm.org/PR34989 diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll index 8e265e19aefea..3a2dede210083 100644 --- a/polly/test/DeLICM/skip_notinloop.ll +++ b/polly/test/DeLICM/skip_notinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; double phi = 0.0; diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll index 2cf13afe11cdf..a0ed9f76a8ca2 100644 --- a/polly/test/DeLICM/skip_scalaraccess.ll +++ b/polly/test/DeLICM/skip_scalaraccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll index f3bf07bb40d83..f1e47075e2f74 100644 --- a/polly/test/DeadCodeElimination/chained_iterations.ll +++ b/polly/test/DeadCodeElimination/chained_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | 
FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll index 52f034f0e56ca..6ecc07c0f7d21 100644 --- a/polly/test/DeadCodeElimination/chained_iterations_2.ll +++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll index e54df42ed1db0..b43142be2a5c8 100644 --- a/polly/test/DeadCodeElimination/computeout.ll +++ b/polly/test/DeadCodeElimination/computeout.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll index c102f60abb659..85eea91f99207 100644 --- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll +++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll index 36f55476fed23..21b7c5cf9583b 100644 --- 
a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll +++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll index ef528b4124c66..86cabe6501393 100644 --- a/polly/test/DeadCodeElimination/non-affine.ll +++ b/polly/test/DeadCodeElimination/non-affine.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1) ; diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll index 01d34e95629ba..507d690144e01 100644 --- a/polly/test/DeadCodeElimination/null_schedule.ll +++ b/polly/test/DeadCodeElimination/null_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; A[0] = 1; ; diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll index c2a3456b3dc80..3fdc4008f8474 100644 --- a/polly/test/DependenceInfo/computeout.ll +++ b/polly/test/DependenceInfo/computeout.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll index f89791f42f9db..69274f11f567f 100644 --- a/polly/test/DependenceInfo/different_schedule_dimensions.ll +++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll @@ -1,5 +1,4 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; CHECK: RAW dependences: ; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] } diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll 
b/polly/test/DependenceInfo/do_pluto_matmult.ll index b88cf9bf5475c..2a0027bbc034b 100644 --- a/polly/test/DependenceInfo/do_pluto_matmult.ll +++ b/polly/test/DependenceInfo/do_pluto_matmult.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll index 5abbf48136891..06a196822c832 100644 --- a/polly/test/DependenceInfo/fine_grain_dep_0.ll +++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC ; REF: RAW dependences: ; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N } diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll index 677323495476b..9875257694331 100644 --- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll +++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; for (int i = 0; i < N; i++) { diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll index 
cde3102dc3dc9..c9473e614e362 100644 --- a/polly/test/DependenceInfo/infeasible_context.ll +++ b/polly/test/DependenceInfo/infeasible_context.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=FUNC-SCOP -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=FUNC-DEPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-SCOP +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-detect -polly-print-deps -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-DEPS ; ; FUNC-SCOP-NOT: Statement ; FUNC-DEPS-NOT: RAW dependences diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll index 392a34769cddb..92e6cb89b2a27 100644 --- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll +++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; Verify that the presence of a may-write (S1) between a read (S0) and a ; must-write (S2) does not block the generation of RAW dependences. This makes diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll index ae5fd3beed399..b14759725dde0 100644 --- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll +++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll index 7722ee974c3fa..45789088e57e4 100644 --- a/polly/test/DependenceInfo/reduction_complex_location.ll +++ b/polly/test/DependenceInfo/reduction_complex_location.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git 
a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll index 840d1f32dca39..7923975118bb9 100644 --- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll +++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; This loopnest contains a reduction which imposes the same dependences as the ; accesses to the array A. We need to ensure we keep the dependences of A. diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll index 56d84a9aec6d6..fdcd5f311800d 100644 --- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll +++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll @@ -1,7 +1,7 @@ ; Test that the reduction dependences are always initialised, even in a case ; where we have no reduction. If this object is NULL, then isl operations on ; it will fail. -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll index 3b4bd9ef04b5a..13675ada39b0e 100644 --- a/polly/test/DependenceInfo/reduction_indirect_access.ll +++ b/polly/test/DependenceInfo/reduction_indirect_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK: [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N } diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll index 76c7fc64ae89c..e6ce425719ca9 100644 --- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll +++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 } diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll index 02b814a0d7c04..820371937a582 100644 --- 
a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll @@ -1,6 +1,6 @@ -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s ; ; Verify that only the inner reduction like accesses cause reduction dependences ; diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll index 91bd35deebd06..9792f791c6989 100644 --- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll index 040d513782392..9bde285c64516 100644 --- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll index 527a8cfc3556e..ac3adb9065462 100644 --- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll +++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; Verify we do not have dependences between the if and the else clause ; diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll index fb5fd96a2e426..16ca85bff9502 100644 --- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll +++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; ; These are the important RAW dependences, as they need to originate/end in only one iteration: diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll index 3ec3920268b49..de506a39485cc 100644 --- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll +++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; FIXME: Change the comment once we allow different pointers ; The statement is "almost" reduction like but should not yield any reduction dependences diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll index 23bd8ef25bd7a..fbf1409a1ba30 100644 --- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll +++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll index 0e0f71737ffd3..0d66f885cd42d 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll index cafa319e2cc7b..81235d6cf02e4 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; We have privatization dependences from a textually later statement to a ; textually earlier one, but the dependences still go forward in time. 
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll index d86da92fbcab8..6b48ab5afd155 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll index d84c04fc309b0..1fef004c4c47a 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll index 592c7238c3c59..f40a7c07a3ba4 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 } diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll index 7ce9d37d395bb..d881a99adc226 100644 --- a/polly/test/DependenceInfo/reduction_sequence.ll +++ b/polly/test/DependenceInfo/reduction_sequence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; void manyreductions(long *A) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll index d13d14ecaad92..b811d1593ab02 100644 --- a/polly/test/DependenceInfo/reduction_simple_iv.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll index 4c97fbb1aacb7..0a5d36f9b9f79 100644 --- 
a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s ; ; REQUIRES: asserts ; diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll index 804005cf72a72..90f9d76ef57b2 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll index 9596827b4cbbf..2b194bbb51988 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 } diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll index d67683d11a4b3..70d5bdf64059d 100644 --- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll +++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll index 6ae7200303321..023c2d4f29f37 100644 --- a/polly/test/DependenceInfo/sequential_loops.ll +++ b/polly/test/DependenceInfo/sequential_loops.ll @@ -1,6 +1,6 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based 
-polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS ; VALUE: RAW dependences: ; VALUE-NEXT: { } diff --git a/polly/test/FlattenSchedule/gemm.ll b/polly/test/FlattenSchedule/gemm.ll index b20293bd315a3..11dc40599bb0e 100644 --- a/polly/test/FlattenSchedule/gemm.ll +++ b/polly/test/FlattenSchedule/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-flatten-schedule -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten>' -polly-print-flatten-schedule -disable-output < %s | FileCheck %s ; ; dgemm kernel ; C := alpha*A*B + beta*C diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll index 6c81fb12e8cdc..3dfe3fa0aa8e6 100644 --- a/polly/test/ForwardOpTree/atax.ll +++ b/polly/test/ForwardOpTree/atax.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll index b9081f3734044..ec8869da3ae57 100644 --- a/polly/test/ForwardOpTree/changed-kind.ll +++ b/polly/test/ForwardOpTree/changed-kind.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; In the code below, %0 is known to be equal to the content of @c (constant 0). ; Thus, in order to save a scalar dependency, forward-optree replaces diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll index 767a580dccf95..de47bc4df0076 100644 --- a/polly/test/ForwardOpTree/forward_from_region.ll +++ b/polly/test/ForwardOpTree/forward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions from region statements. 
; diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll index 5d0b0a884b761..39f99545b01ac 100644 --- a/polly/test/ForwardOpTree/forward_hoisted.ll +++ b/polly/test/ForwardOpTree/forward_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify). ; This involves making the load-hoisted %val1 available in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll index 50a9b07b8a05b..a9f5d3d85ac0a 100644 --- a/polly/test/ForwardOpTree/forward_instruction.ll +++ b/polly/test/ForwardOpTree/forward_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll index ef71b11dc5716..2279a89cfaeb7 100644 --- a/polly/test/ForwardOpTree/forward_into_region.ll +++ b/polly/test/ForwardOpTree/forward_into_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions to region statements. ; diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll index 1c585446ae63a..f7901e1ccf8fd 100644 --- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll +++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; define void @foo(ptr %A, i32 %p, ptr %B) { diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll index 0bba41833fb19..860e603ef47d2 100644 --- a/polly/test/ForwardOpTree/forward_load.ll +++ b/polly/test/ForwardOpTree/forward_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load.
; diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll index 364bf3ef37133..24b008cfae384 100644 --- a/polly/test/ForwardOpTree/forward_load_differentarray.ll +++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; To forward %val, B[j] cannot be reused in bodyC because it is overwritten ; between. Verify that instead the alternative C[j] is used. diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll index 4c30c7f8da56f..522e803b2d0a0 100644 --- a/polly/test/ForwardOpTree/forward_load_double_write.ll +++ b/polly/test/ForwardOpTree/forward_load_double_write.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load even in case two writes of identical values are in ; one scop statement. diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll index 1494e872a8942..5c64221d882b9 100644 --- a/polly/test/ForwardOpTree/forward_load_fromloop.ll +++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward the LoadInst %val into %bodyB. %val is executed multiple times, ; we must get the last loaded values. diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll index 51ce94d267277..5b06c357f02ba 100644 --- a/polly/test/ForwardOpTree/forward_load_indirect.ll +++ b/polly/test/ForwardOpTree/forward_load_indirect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward an operand tree consisting of a speculatable instruction (%add) ; and a load (%val). diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll index bd2cad411eccf..b889783d531e6 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_after.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access.
; diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll index 3e89dea37775c..c8f0e0e5814fb 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_before.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access. ; diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll index 7526a8313945d..df57bf70cc53b 100644 --- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll +++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; %val1 is used three times: Twice by its own operand tree of %val2 and once ; more by the store in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll index daf289d8b0da1..ba84a1a16748f 100644 --- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll +++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; The non-analyzable store to C[0] is unrelated and can be ignored. diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll index 1457aa96e2de7..c763af4269c89 100644 --- a/polly/test/ForwardOpTree/forward_phi_load.ll +++ b/polly/test/ForwardOpTree/forward_phi_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. 
; diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll index 646121c4efeff..69c7f10be4e56 100644 --- a/polly/test/ForwardOpTree/forward_readonly.ll +++ b/polly/test/ForwardOpTree/forward_readonly.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll index d8ad31782ecb9..e39e7b51dc689 100644 --- a/polly/test/ForwardOpTree/forward_reusue.ll +++ b/polly/test/ForwardOpTree/forward_reusue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move operand tree without duplicating values used multiple times. ; diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll index 17cb8b395eb30..8cd6e2446ff93 100644 --- a/polly/test/ForwardOpTree/forward_store.ll +++ b/polly/test/ForwardOpTree/forward_store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll index 57b68180bb121..f70965f3c5d1b 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Copy %val to bodyB, assuming the exit value of %i. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll index b4828e4c2c423..c95c45856ac36 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Test support for (synthesizable) induction variables.
; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll index 3228bb60d2ca2..14fb8d8dcc0ab 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Synthesizable values defined outside of a loop can be used ; inside the loop. diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll index aacf1358648f5..7b55d9e0cf9b2 100644 --- a/polly/test/ForwardOpTree/forward_transitive.ll +++ b/polly/test/ForwardOpTree/forward_transitive.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll index cb035bb749c7b..3bc504d88c0eb 100644 --- a/polly/test/ForwardOpTree/jacobi-1d.ll +++ b/polly/test/ForwardOpTree/jacobi-1d.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll index bd5864c25f543..0729241c3f7d9 100644 --- a/polly/test/ForwardOpTree/noforward_from_region.ll +++ b/polly/test/ForwardOpTree/noforward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Ensure we do not move instructions from region statements in case the ; instruction to move loads from an array which is also written to from diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll index 5474e740de800..d33ef99ae6bed 100644 --- a/polly/test/ForwardOpTree/noforward_load_conditional.ll +++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; B[j] is overwritten by at least one statement between the ; definition of %val and its use. Hence, it cannot be forwarded. 
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll index 697c940be4fdd..e7deb381de87a 100644 --- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll +++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Cannot rematerialize %val from B[0] at bodyC because B[0] has been ; overwritten in bodyB. diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll index 306bb8d7558d1..5e30cf88de4cf 100644 --- a/polly/test/ForwardOpTree/noforward_outofquota.ll +++ b/polly/test/ForwardOpTree/noforward_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS ; REQUIRES: asserts ; ; for (int j = 0; j < n; j += 1) { diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll index edb5d34801cc5..f95bb77f70b67 100644 --- a/polly/test/ForwardOpTree/noforward_partial.ll +++ b/polly/test/ForwardOpTree/noforward_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Not the entire operand tree can be forwarded, ; some scalar dependencies would remain. diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll index 755abad4336ef..025fe64724151 100644 --- a/polly/test/ForwardOpTree/noforward_phi.ll +++ b/polly/test/ForwardOpTree/noforward_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not move PHI nodes. ; diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll index be7e82f726331..8b30137858243 100644 --- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll +++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Contains a self-referencing PHINode that would require a ; transitive closure to handle. 
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll index c01b72a1c1420..179b02a259025 100644 --- a/polly/test/ForwardOpTree/noforward_sideffects.ll +++ b/polly/test/ForwardOpTree/noforward_sideffects.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not forward instructions with side-effects (here: function call). ; diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll index 776d848072a23..6baec6d9e1c6c 100644 --- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll +++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not try to forward %i.trunc, it is not synthesizable in %body. ; diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll index ee3e32698dd02..95df49a5c061a 100644 --- a/polly/test/ForwardOpTree/out-of-quota1.ll +++ b/polly/test/ForwardOpTree/out-of-quota1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output %s | FileCheck %s ; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota. 
diff --git a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll index ec1ccdce94508..a5102b3557f0c 100644 --- a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s ; ; void jd(int *A) { ; CHECK: #pragma omp parallel for diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll index 9c00690605408..d086b59f97a5a 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll index 356762a2ae5b9..49a6b0531de56 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; int A[1024][1024]; ; void bar(int n) { diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll index 066fc39def6ac..d2d7917b08528 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll index 77dd55cb7605e..c03189a211256 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll index b61ebc9379b7f..6829211cc76b9 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll index 5c92a91681867..7199a337d8a4f 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll index 352d879199675..41d35bfdb3631 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll index 81c29536010b6..356269cefad36 100644 --- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll +++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll index 904f55dc32ce4..039c5f74fabfe 100644 --- a/polly/test/IstAstInfo/alias_simple_1.ll +++ b/polly/test/IstAstInfo/alias_simple_1.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' 
-aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll index 5fae579995b23..1783a04f02be9 100644 --- a/polly/test/IstAstInfo/alias_simple_2.ll +++ b/polly/test/IstAstInfo/alias_simple_2.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; ; int A[1024], B[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll index 8599c29934744..8d507fb82cb2d 100644 --- a/polly/test/IstAstInfo/alias_simple_3.ll +++ b/polly/test/IstAstInfo/alias_simple_3.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | 
FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; float B[1024]; diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll index dc21dc1f96a48..01b5372917358 100644 --- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll +++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output -polly-invariant-load-hoisting < %s | FileCheck %s ; CHECK: if (1 && 1 && (&MemRef_X[1] <= &MemRef_BaseA[0] || &MemRef_BaseA[1024] <= &MemRef_X[0]) && (&MemRef_X[1] <= &MemRef_BaseB[0] || &MemRef_BaseB[1024] <= &MemRef_X[0])) diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll index 8d4adfa405f07..3835c23fecddb 100644 --- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll index be37b27b6e375..71bac9a2bb141 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output %s | FileCheck %s ; ; void 
jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll index 15550583340db..e5ece1f57a85e 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/dependence_distance_constant.ll b/polly/test/IstAstInfo/dependence_distance_constant.ll index 9b7fb93f2f676..43b13eef9a95b 100644 --- a/polly/test/IstAstInfo/dependence_distance_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll index d69cc3f9fc3f8..35a503ce7eb8d 100644 --- a/polly/test/IstAstInfo/dependence_distance_minimal.ll +++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; The minimal dependence distance of the innermost loop should be 1 instead of 250. 
; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll index bc21e9e07ad89..a7de5c4876385 100644 --- a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict B, int N) { ; CHECK: #pragma minimal dependence distance: 5 diff --git a/polly/test/IstAstInfo/dependence_distance_parametric.ll b/polly/test/IstAstInfo/dependence_distance_parametric.ll index fa569a8386b86..fa05e4c889031 100644 --- a/polly/test/IstAstInfo/dependence_distance_parametric.ll +++ b/polly/test/IstAstInfo/dependence_distance_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N, int c) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll index 7f280e0c542ca..73f74b3bce0b1 100644 --- a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll +++ b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N, int c, int v) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_varying.ll b/polly/test/IstAstInfo/dependence_distance_varying.ll index d609c2f210f8d..e908954536600 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: -(N % 2) + 2 diff --git a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll index 8ed3220353c1b..1668fc0515441 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-canonicalize -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict sum) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll index 
73768e9c308a4..0d0aa8bea31d8 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D, ; int *restrict E, int N) { diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll index e2cf0bd9c0df2..2ed94e59e8087 100644 --- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll +++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; CHECK: { ; CHECK-NEXT: if (p <= -1 || p >= 1) diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll index 98e8d2db959f8..a285a8f032f5e 100644 --- a/polly/test/IstAstInfo/non_affine_access.ll +++ b/polly/test/IstAstInfo/non_affine_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; void non_affine_access(float A[]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll index 697b6ca50d444..3fefc74efbef0 100644 --- a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll +++ b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum) ; void f(int N, int M, int P, int sum[P][M]) { diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll index c20a7d6db13c9..41bd178c73c2a 100644 --- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll +++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum) ; void f(int N, int M, int *sum) { diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll index e6092f0b068f8..5aa8a0c244423 100644 --- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll +++ 
b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; This loopnest contains a reduction which imposes the same dependences as the ; accesses to the array A. We need to ensure we do __not__ parallelize anything diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll index 14de70f9357c3..91f7c9d9601bc 100644 --- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll +++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and) ; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and) diff --git a/polly/test/IstAstInfo/reduction_in_one_dimension.ll b/polly/test/IstAstInfo/reduction_in_one_dimension.ll index 797115b6f8d70..d0173bcd978ca 100644 --- a/polly/test/IstAstInfo/reduction_in_one_dimension.ll +++ b/polly/test/IstAstInfo/reduction_in_one_dimension.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that we won't privatize anything in the outer dimension ; diff --git a/polly/test/IstAstInfo/reduction_loop_reversal.ll b/polly/test/IstAstInfo/reduction_loop_reversal.ll index d30119787d8e0..d010e26f739a6 100644 --- a/polly/test/IstAstInfo/reduction_loop_reversal.ll +++ b/polly/test/IstAstInfo/reduction_loop_reversal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT: #pragma simd{{\s*$}} ; CHECK: #pragma simd reduction diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll index 15fca884c2b63..7f78badfcb93c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll +++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (+ : MemRef_A) ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git 
a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll index 44e9aa4d1e569..42e9c3b19eb1b 100644 --- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction ; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_schedule.ll index c39ffa591484d..8bdd5299986eb 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (+ : MemRef_A) ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll index 266753555cab1..4811069e4f399 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1) diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll index d7f9029fd347a..4f5ac24a0b005 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll index f18060a2e20a8..472a04847ec95 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer 
dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll index 8e2a590c5f57c..2cc911d78234b 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll index b889db4819cd5..1b2d0eb75c12c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that only the outer dimension needs privatization ; diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll index 2a8fd7a4f670e..884cea7918031 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll index 25f2fa597e34e..013a7d4f3ad27 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll index 0d6be9a9da9bf..2dc6d8680b36a 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll 
index 8b537513cc8d7..dcd75945d25a8 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll index 44d3534f651ce..67fc4b74571da 100644 --- a/polly/test/IstAstInfo/run-time-condition.ll +++ b/polly/test/IstAstInfo/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; for (i = 0; i < 1024; i++) ; A[i] = B[i]; diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll index aef509a865b6a..d674f429c0d48 100644 --- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll +++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s ; ; Verify we do not simplify the runtime check to "true" due to the domain ; constraints as the test contains an error block that influenced the domains diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll index 488cd180b899a..73a7c596cea0b 100644 --- a/polly/test/IstAstInfo/simple-run-time-condition.ll +++ b/polly/test/IstAstInfo/simple-run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll index afe6179188c01..f546972fb370c 100644 --- a/polly/test/IstAstInfo/single_loop_strip_mine.ll +++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR ; for (i = 0; i < 1024; i++) ; A[i] = B[i]; 
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll index f614f90fc3fc9..c9ae9e8f4e52e 100644 --- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll +++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#include "limits.h" diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll index e91ea13278692..45227160e8699 100644 --- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll +++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#include "limits.h" diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll index 49a962592bb9d..28b6a7ca12799 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: expecting other token ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll index 749b962b260f5..f19a632815795 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Statement from JScop file has no key name 'accesses' for index 1. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll index 1d97e3ebca625..77b9acfbb0989 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0. 
; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll index f4b739398f9f6..0a06ff671c298 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of indices and the number of statements differ. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll index 1f5cda3518a2f..35b7af098ae42 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Memory access number 0 has no key name 'relation' for statement number 1. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll index 0c750849b51eb..109665a85c607 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key name 'statements'. 
; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll index d8c9c3f4ab2ea..f345d1c31796e 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file contains access function with undeclared ScopArrayInfo ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll index f8d7cb8c1453e..a66d5c8c69b55 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file changes the number of parameter dimensions. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll index 6e13a5e413d76..ae0b4edffb5fc 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll @@ -1,4 +1,4 @@ - ; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has not a valid type. 
; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll index 7f6578776e0bd..6c434e15a38d2 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; #define Ni 1056 ; #define Nj 1056 diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll index e698bdc488c2c..b004c4725176a 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'name'. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll index f130b6556e3e5..5f62a457f63eb 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'sizes'. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll index 68d2e50c6730d..029fde10f5a4a 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'type'. 
; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll index 94c77dc2a0138..9ac371b655146 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key named 'context'. ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll index c20d5c02d662e..82afcd95c871f 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The isl_set is not a parameter set. ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll index 92f4d61212e93..0308452c6f955 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: unexpected isl_token ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll index 89668d8d573b1..debb9bc604110 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1 ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll index efe15c14ce90d..6eee0056ba0b5 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel 
-disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Statement 0 has no 'schedule' key. ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll index db516f6d7d335..59feb0085e6de 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: expecting other token ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll index b93c984d7d9dd..78d5243d34e00 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key name 'statements'. ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll index 3fa14c64cd639..877547c8f317f 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of indices and the number of statements differ. 
; diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll index 1d81ff7ef2dc8..9f999204f59bf 100644 --- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll +++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the expansion of an array with load after store in a same statement is not done. ; diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll index 57017381c661a..1a733c113626d 100644 --- a/polly/test/MaximalStaticExpansion/read_from_original.ll +++ b/polly/test/MaximalStaticExpansion/read_from_original.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that Polly detects problems and does not expand the array ; diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll index 7e33de17a1749..a7aa162aa83da 100644 --- a/polly/test/MaximalStaticExpansion/too_many_writes.ll +++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that Polly detects problems and does not expand the array ; diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll index 355fc02600d54..06e08c43e3492 100644 --- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll +++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Array ; diff --git 
a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll index 930539547cc97..076f47143dbcc 100644 --- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll +++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI. ; tmp_06_phi is not expanded because it need copy in. diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll index a055e50225e91..2b040f3f1f4e3 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Array ; diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll index 77338c9aac200..f863c0e1d6edf 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded ; diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll index 9cfa5536072b7..a823bdb4e7682 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded ; diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll index 63e4d48046275..0898f99c896d4 100644 --- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" 
-disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::PHI ; tmp_04 is not expanded because it need copy-in. diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll index 87bd57abab8d1..2a332ba7ce77b 100644 --- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll +++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::PHI ; tmp_05 and tmp2_06 are not expanded because they need copy-in. diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll index cc28a78c38671..77f20bb163a8b 100644 --- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Value ; diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll index 9cc2aecf002dd..b4524c21a35ee 100644 --- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll +++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<prune>' -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s ; REQUIRES: asserts ; ; Skip this SCoP for having scalar dependencies between all statements, diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll index 38facb1688c46..c8c006c94d1d4 100644 --- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll +++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' 
-S < %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" define void @sdbout_label() nounwind { diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll index 835986049899b..23033faa380af 100644 --- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll +++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Check that we handle statements with an empty iteration domain correctly. diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll index 5e4ce8225a236..fdaed3c543673 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll index de4c387a1d879..65d495722c2bd 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll index 91bd549c3c7e4..06d55f46a977f 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 
'-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll index 8b69d9e12c0fe..0af703ccf5ffe 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll index 49d1124740340..ca6840b900e7f 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll index a449a2fda9ba3..f96e4baba71eb 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 
'-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll index 798e9b9a7c14f..229d13aaf1a4d 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll index 4d0ccc988a5cc..9bc9a25ac588e 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s ; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions). 
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll index bf470b91a7022..5b0cefbe686f6 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll index b0f75dd50ef83..2225f05f6717d 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines ; ; Check that the disable_nonforced metadata is honored; optimization ; heuristics/rescheduling must not be applied. 
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll index 900360d7533f8..4add219214aa3 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF ; define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) { entry: diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll index d45b62433dbbc..d59f9e58e2785 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines ; ; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll index d835e66693fb4..a1caaf5db5a61 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines ; ; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll index a5781a7f60365..b05710203fd37 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' 
-polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Override unroll metadata with llvm.loop.unroll.disable. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll index cccf136a1c4ac..8992bc942646e 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Apply two loop transformations. First partial, then full unrolling. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll index 4d499078a4364..7bea96f791a80 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Full unroll of a loop with 5 iterations. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll index d67472ab86936..34a6f486e646c 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Unrolling with heuristic factor. ; Currently not supported and expected to be handled by LLVM's unroll pass. diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll index 90101b4fde390..ce2281372a20d 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines ; ; Partial unroll by a factor of 4. 
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index 4cfa3fb911515..f6810ba6c48fb 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; Partial unroll by a factor of 4.
 ;
@@ -54,6 +54,6 @@ return:
 ; AST-NEXT:   for (int c0 = 0; c0 < n; c0 += 4) {

-; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.exiting, !llvm.loop ![[LOOPID:[0-9]+]]
+; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit, !llvm.loop ![[LOOPID:[0-9]+]]
 ; CODEGEN: ![[LOOPID]] = distinct !{![[LOOPID]], ![[LOOPNAME:[0-9]+]]}
 ; CODEGEN: ![[LOOPNAME]] = !{!"llvm.loop.id", !"This-is-the-unrolled-loop"}
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index 3f6f50e34775d..b03d475dd42ee 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-custom<opt-isl;ast;codegen>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; Check that there are no nested #pragma omp parallel for inside a
 ; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index a3286b481ffb3..6f34f4efc0a6d 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 928ee858ae6d2..4be0b948d09a0 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,9 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=1 \
-; RUN: -polly-target-vector-register-bitwidth=4096 \
-; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=1 -polly-target-vector-register-bitwidth=4096 -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
 ;
 ; /* Test that Polly does not crash due to configurations that can lead to
 ;    incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index b533cb870bdcb..548a8aa94afbf 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
 ;
 ; llvm.org/PR46578
 ;
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 3dd579ed736f7..6de5e3a606aa3 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK-NEXT: #pragma known-parallel
 ; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index 3a2c566d19d3d..6256adfcd6917 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s

 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 0dbdeff4742b9..51e02594aa880 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s

 ; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index 8f270b94617fe..79deedc7cd830 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,13 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-optimized-scops \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-optimized-scops -polly-target-vector-register-bitwidth=256 -disable-output < %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index de1c815f92350..e3ae1a02bd347 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; /* _PB_NK % Kc != 0 */
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index a097d4a43cfd2..f37f1e5119a9f 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; void jacobi1d(long T, long N, float *A, float *B) {
 ;   long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index 7c1af80c9ffae..e0a7a63cda80d 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER

 ; By skewing, the diagonal can be made parallel. ISL does this when
 ; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index a19b93d9915dd..84f1ca0dba652 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,8 +1,4 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true '-passes=polly-custom<optree;delicm;simplify;opt-isl>' -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 4ef0605a0ba75..72fb4f1b4e41c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm;simplify-1;opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the tensor contraction pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 09118e252233b..933b2d4d258e7 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,8 +1,7 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
-; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=false -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index b771d1f87537e..03e23038877e5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,16 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
-; RUN: -polly-import-jscop-postfix=transformed \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -debug \
-; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl>' -polly-import-jscop-postfix=transformed -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index 238f6dd798e68..4e174e3c9723d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -passes=polly-opt-isl -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index 0e4540eb7ba3c..c3d8b6ed3fee5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=2 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=128 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=2 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=128 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index 9678ad83ff048..3705c3fd27ed9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,13 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-import-jscop-postfix=transformed -S < %s \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl;ast;codegen>' -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; Check that we disable the Loop Vectorizer.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index e74884d59c311..7ada105828b27 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug-only=polly-opt-isl -disable-output \
-; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug-only=polly-opt-isl -disable-output -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index 9c99a090b69e7..6647380b2d070 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index 8e14035ce8629..fba77d5e4f82d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index 4f562c306f96a..488436064ae83 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index 32ded897d4ff9..c7a5d475bef31 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 8; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index f0c0177da84b0..1dba8bece8072 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 155177bdfade0..3656a9457cef2 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 16; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index 3d21ac3859a7e..bd0cb054957af 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 00a4bf885aef8..6e6788be2973f 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index bfe5c5249a3a8..82356ae0a398d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
-; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
-; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index a2e1ced3e6320..ea28bb8c0bdb6 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index 9844d377e609d..f80d63cd4d66c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,19 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-size=0 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=0 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s

-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index 250641d57bac5..100b17e2ccd21 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,13 +1,5 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-tc-opt=true -disable-output < %s | \
-; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-tc-opt=true -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ; C := A * B + C
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index ad2c195ba1e8e..050af1b2377d3 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ;    -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index 1d3cdbdbfdd85..ba1ddfef6a4e4 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ;    -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index 59eaa4a0928e9..e50b3a0a3f2b0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index 2544d502a2dc5..3f57fe8cf6c73 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index 85c143562f5af..b87ed4fb1ec3c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,14 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -passes=polly-opt-isl -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
 ;
-; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 64285891a16c7..98c1db6d36fbe 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;opt-isl>' -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index 1c6d289744e39..4784dc88cd307 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 1ff20d165ce5e..6d1592c4ba8fa 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
index 0bc3c2cf642e8..f346e5365b198 100644
--- a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl>' -polly-debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) {
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index e1d768b351d7d..3fd4907909419 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER

 ; CHECK: // 1st level tiling - Tiles
 ; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index 1e1359e3ecc6a..1ee8a90473bd3 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly-custom<optree;delicm;opt-isl>' -polly-schedule-computeout=10000 -debug-only=polly-opt-isl < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 ; Bailout if the computations of schedule compute exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index 84eb59341d273..bb705ac6abf38 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines

 ; REQUIRES: asserts

diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index 50a46d66176ea..e3d7c24ebef77 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ;
 ; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index e59a31665d77b..bb472b9c3763f 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s

 ; isl_schedule_node_band_sink may sink into multiple children.
 ; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index cee1c06cf7aa0..d83c822371b6e 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index 5506b3c626cfd..63c9addd0b6e1 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index 5f43ec1856a7f..ea8a7688f3d25 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index e853dfcc64485..df68289ff7352 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/base_pointer.ll b/polly/test/ScopDetect/base_pointer.ll
index e500f9bc20bc6..0f0e219bd90d1 100644
--- a/polly/test/ScopDetect/base_pointer.ll
+++ b/polly/test/ScopDetect/base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-invariant-load-hoisting=true -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= -polly-invariant-load-hoisting=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index eeb9e11f812c3..b00ec77679063 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts;import-jscop>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This violated an assertion in setNewAccessRelation that assumed base pointers
 ; to be load-hoisted. Without this assertion, it codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index 16976e6313275..1cd04b639fc99 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;import-jscop>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --allow-empty
 ;
 ; Polly codegen used to generate invalid code (referring to %ptr from the
 ; original region) when regeneration of the access function is necessary.
@@ -35,3 +35,5 @@
 exit:

 ; CHECK-NOT: Valid Region for Scop
+; CHECK: Detected Scops in Function base_pointer_is_inst_inside_invariant_1
+; CHECK-NOT: Valid Region for Scop
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index 4182974693678..4200339a04a13 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
 ; REQUIRES: asserts

 ; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f451bccec706f..f5263e4e4c40a 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting -disable-output < %s 2>&1 | FileCheck %s

 ;CHECK: Function: test_init_chpl
 ;CHECK-NEXT:    Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index fe3922174c07c..d7605c36d449c 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index 4cac173932a6f..c3a2ad4791ba7 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index 7d7476471bb6e..e896e18589e94 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

 define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/detect-full-functions.ll b/polly/test/ScopDetect/detect-full-functions.ll
index 178ef32827cab..adad0e89ffa42 100644
--- a/polly/test/ScopDetect/detect-full-functions.ll
+++ b/polly/test/ScopDetect/detect-full-functions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s

 ; Verify if a simple function with basic block not part of loop doesn't crash with
 ; polly-process-unprofitable=false and polly-detect-full-functions flags.
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll index efc732c50e177..0f670ca230824 100644 --- a/polly/test/ScopDetect/dom-tree-crash.ll +++ b/polly/test/ScopDetect/dom-tree-crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Detected Scops in Function foo diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll index d14bf8a23a166..de1f52813475a 100644 --- a/polly/test/ScopDetect/dot-scops-npm.ll +++ b/polly/test/ScopDetect/dot-scops-npm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s ; RUN: FileCheck %s -input-file=scops.func_npm.dot ; ; Check that the ScopPrinter does not crash. diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll index 63163b23617cf..a719d21300b15 100644 --- a/polly/test/ScopDetect/dot-scops.ll +++ b/polly/test/ScopDetect/dot-scops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that the ScopPrinter does not crash. ; ScopPrinter needs the ScopDetection pass, which should depend on diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll index 20d02b1c1ae0b..0e82e37d10095 100644 --- a/polly/test/ScopDetect/error-block-always-executed.ll +++ b/polly/test/ScopDetect/error-block-always-executed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll index 6c66f6df14af5..338fe20679bcf 100644 --- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll +++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll index 6ba7698a972bb..85f248da9be18 100644 --- a/polly/test/ScopDetect/error-block-unreachable.ll +++ b/polly/test/ScopDetect/error-block-unreachable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s ; Verify that the scop detection does not crash on inputs with unreachable ; blocks. Earlier we crashed when detecting error blocks. 
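dot-scops-npm.ll above shows a third variant: the standalone polly-scop-printer pass is replaced by running detection through the custom pipeline with the -polly-dot flag. A sketch, with the output file name taken from that test (the per-function scops.<function>.dot naming is inferred from it):

  ; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s
  ; RUN: FileCheck %s -input-file=scops.func_npm.dot

Because the DOT printer writes files rather than printing to stderr, FileCheck reads the generated file instead of a pipe.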
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll index a5c9626d28361..43fdda8321cbe 100644 --- a/polly/test/ScopDetect/expand-region-correctly-2.ll +++ b/polly/test/ScopDetect/expand-region-correctly-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer ; diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll index a8c90c08fde0c..b4caac4478d1d 100644 --- a/polly/test/ScopDetect/expand-region-correctly.ll +++ b/polly/test/ScopDetect/expand-region-correctly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll index a75e705995a75..ef1c66686251a 100644 --- a/polly/test/ScopDetect/ignore_func_flag_regex.ll +++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-polly-ignore-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-ignore-func` works with regexes. ; diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll index f6d6cfab0eede..a6f7079f68407 100644 --- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll +++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). 
The current implementation of deriving their domain derives diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll index 16d47619b0ff2..be76e0b138933 100644 --- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll +++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll index 3fbc4d65bbe20..e45e4fb016155 100644 --- a/polly/test/ScopDetect/indvars.ll +++ b/polly/test/ScopDetect/indvars.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll index 58c9197f7f799..43fa4ca619ed7 100644 --- a/polly/test/ScopDetect/intrinsics_1.ll +++ b/polly/test/ScopDetect/intrinsics_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Valid Region for Scop: for.cond => for.end ; diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll index f71016e6d04cd..b4cc3df7c746b 100644 --- a/polly/test/ScopDetect/intrinsics_2.ll +++ b/polly/test/ScopDetect/intrinsics_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we allow the lifetime markers for the tmp array. ; diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll index 579d5bd481d44..08fdee573ba0f 100644 --- a/polly/test/ScopDetect/intrinsics_3.ll +++ b/polly/test/ScopDetect/intrinsics_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we allow the misc intrinsics. 
; diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll index db4898c9c7bd7..c7d7c51e7d220 100644 --- a/polly/test/ScopDetect/invalid-latch-conditions.ll +++ b/polly/test/ScopDetect/invalid-latch-conditions.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; The latch conditions of the outer loop are not affine, thus the loop cannot ; be handled by the domain generation and needs to be overapproximated. diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll index ddef510ad4d9f..977918eb5168d 100644 --- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll +++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll index 10479643959cb..932c218170caf 100644 --- a/polly/test/ScopDetect/invariant-load-before-scop.ll +++ b/polly/test/ScopDetect/invariant-load-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; The LoadInst %.b761 is defined outside the SCoP, hence is always constant ; within it. It is no "required invariant load".
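invalid-latch-conditions.ll above also illustrates the testing idiom used throughout this directory: one input run several times under different flag combinations, each matched against its own FileCheck prefix. Schematically (the check line here is illustrative, not quoted from the test):

  ; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
  ; NALOOPS: Valid Region for Scop

This keeps the flag matrix and the expected detection result for each combination together in a single file.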
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll index 074aae9ae95c9..efd81c695ca0d 100644 --- a/polly/test/ScopDetect/keep_going_expansion.ll +++ b/polly/test/ScopDetect/keep_going_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll index 64535d85f2ab1..c7972cc47a68d 100644 --- a/polly/test/ScopDetect/mod_ref_read_pointer.ll +++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: for.body => for.end ; MODREF: Valid Region for Scop: for.body => for.end diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll index 30090652326d2..1835342812b1f 100644 --- a/polly/test/ScopDetect/more-than-one-loop.ll +++ b/polly/test/ScopDetect/more-than-one-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll index 2a5f8b15534fa..e89cea98ad21a 100644 --- a/polly/test/ScopDetect/multidim-with-undef-size.ll +++ b/polly/test/ScopDetect/multidim-with-undef-size.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Valid Region for Scop: bb14 => bb17 diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll index 91202373263f0..cbe7d0708b853 100644 --- a/polly/test/ScopDetect/multidim.ll +++ b/polly/test/ScopDetect/multidim.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect 
-disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Valid Region for Scop: bb19 => bb20 diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll index a9cd446d27670..4af37ba064558 100644 --- a/polly/test/ScopDetect/multidim_indirect_access.ll +++ b/polly/test/ScopDetect/multidim_indirect_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we will recognize this SCoP. ; diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll index 9c91fbfbe0b64..0286642f3c7a7 100644 --- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll +++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Derived from the following code: diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll index a0742112b6e12..89071df596807 100644 --- a/polly/test/ScopDetect/nested_loop_single_exit.ll +++ b/polly/test/ScopDetect/nested_loop_single_exit.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; void f(long A[], long N) { ; long i, j; diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll index e74619cd87756..b20828d9a7679 100644 --- a/polly/test/ScopDetect/non-affine-conditional.ll +++ b/polly/test/ScopDetect/non-affine-conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll index 9326cd4290380..77427397bac9d 100644 --- a/polly/test/ScopDetect/non-affine-float-compare.ll +++ b/polly/test/ScopDetect/non-affine-float-compare.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; void f(float *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll 
index 1ab6b35ae93f1..f6ae9fe8dd544 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Here we have a non-affine loop but also a non-affine access which should ; be rejected as long as -polly-allow-nonaffine isn't given. 
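For the non-affine flag matrix above, it may help to recall what shape of code these tests describe; a hypothetical C sketch in the comment style of the surrounding files (not the exact source of any one test):

  ;    void f(int *A) {
  ;      for (int i = 0; i < 1024; i++)  /* affine loop: static bounds */
  ;        while (A[i])                  /* non-affine loop: data-dependent trip count */
  ;          A[A[i]] += 1;               /* non-affine access: data-dependent subscript */
  ;    }

-polly-allow-nonaffine-loops admits the inner while loop, but the data-dependent access is still rejected unless -polly-allow-nonaffine is given as well, which is exactly the distinction the REJECTNONAFFINELOOPS, ALLOWNONAFFINELOOPS, and ALLOWNONAFFINELOOPSANDACCESSES prefixes encode.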
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll index 921f6ab535499..23c1765caecac 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always detect the diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll index 78774d92e0a46..6e239a6570668 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always detect the diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll index 5136b3b8779b1..dd675ccec5999 100644 --- a/polly/test/ScopDetect/non-affine-loop.ll +++ b/polly/test/ScopDetect/non-affine-loop.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; This function/region does contain a loop, however it is non-affine, hence the access ; A[i] is also. 
Furthermore, it is the only loop, thus when we over approximate diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll index fd52c5df7b27e..63b1cdb420b71 100644 --- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll +++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid ; diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll index d0c1f7a613332..ff4ad3218ffa5 100644 --- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll +++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: bb11 => bb25 diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll index bdc48984f9961..5b9ed2b2ecae7 100644 --- a/polly/test/ScopDetect/non-simple-memory-accesses.ll +++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we do not model atomic memory accesses. 
We did not reason about ; how to handle them correctly and the Alias Set Tracker models some of them diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll index 63bd7b3a2f1f2..3c487374c1973 100644 --- a/polly/test/ScopDetect/non_affine_loop_condition.ll +++ b/polly/test/ScopDetect/non_affine_loop_condition.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll index 1d36f4df35bc3..a8ce5bc636833 100644 --- a/polly/test/ScopDetect/only-one-affine-loop.ll +++ b/polly/test/ScopDetect/only-one-affine-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Even if we allow non-affine loops we can only model the outermost loop, all ; other loops are boxed in non-affine regions. However, the inner loops can be diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll index 4742375fec5cf..f4f35048fa8a0 100644 --- a/polly/test/ScopDetect/only_func_flag.ll +++ b/polly/test/ScopDetect/only_func_flag.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-only-func` limits analysis to `f` and `g`. ; diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll index 2ad22c9f7a7f5..f180fa765f4b0 100644 --- a/polly/test/ScopDetect/only_func_flag_regex.ll +++ b/polly/test/ScopDetect/only_func_flag_regex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-polly-only-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-only-func` works with regexes. 
; diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll index 271825a58c399..71d1ba0accd32 100644 --- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll +++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid Region diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll index 2ab8997c63331..6768c969a7428 100644 --- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll +++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; foo(float *A, long n, long k) { ; if (true) diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll index 248bb43aacd98..2e16b75ee3106 100644 --- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll +++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Region with an exit node that has a PHI node multiple incoming edges from ; inside the region. Motivation for supporting such cases in Polly. diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll index d74185b45c752..ac27016e3622d 100644 --- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll +++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll @@ -1,12 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \ -; RUN: -polly-detect-profitability-min-per-loop-insts=40 \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-detect-profitability-min-per-loop-insts=40 '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE -; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE ; UNPROFITABLE-NOT: Valid Region for Scop: ; PROFITABLE: Valid Region for Scop: diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll index 0291d3be452a1..80379bcc5d412 100644 --- a/polly/test/ScopDetect/profitability-two-nested-loops.ll +++ 
b/polly/test/ScopDetect/profitability-two-nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: next => bb3 ; diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll index d95e9bde0b384..1c77d730ed418 100644 --- a/polly/test/ScopDetect/remove_all_children.ll +++ b/polly/test/ScopDetect/remove_all_children.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll index 5e4c38db5e53c..530a22f9ac3d4 100644 --- a/polly/test/ScopDetect/report-scop-location.ll +++ b/polly/test/ScopDetect/report-scop-location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll index f49190b33ccf7..2ade0a97a5991 100644 --- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll +++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid Region for Scop: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll index 74cbedb34e5c6..6f327e318082c 100644 --- a/polly/test/ScopDetect/run_time_alias_check.ll +++ b/polly/test/ScopDetect/run_time_alias_check.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll index f76c832ff08f5..4f03845795c9c 100644 --- a/polly/test/ScopDetect/scev_remove_max.ll +++ b/polly/test/ScopDetect/scev_remove_max.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s ; This test case helps to determine whether SCEVRemoveMax::remove produces ; an infinite loop and a segmentation fault, if it processes, for example, diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll index 4a84f356f3e81..338a9ae6b6b0e 100644 --- 
a/polly/test/ScopDetect/sequential_loops.ll +++ b/polly/test/ScopDetect/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll index 33823b21fb8fb..5da4898517e22 100644 --- a/polly/test/ScopDetect/simple_loop.ll +++ b/polly/test/ScopDetect/simple_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll index 1bba2c21c7473..00e11ab252e73 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll index 93ec84e911c5d..9f75b80f58cef 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll index 33b0d8d7d6fc0..c6ce482403400 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll index 9b47b7c946caf..c90c4915e866d 100644 --- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll +++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll index 4a0a3adab661d..67f677892313c 100644 --- a/polly/test/ScopDetect/simple_loop_with_param.ll +++ b/polly/test/ScopDetect/simple_loop_with_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | 
FileCheck %s -check-prefix=PHI +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI ; void f(long A[], long N, long *init_ptr) { ; long i, j; diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll index 670936b6fee80..9e7b55efc48d9 100644 --- a/polly/test/ScopDetect/simple_loop_with_param_2.ll +++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll index 6ace3b636019b..e56c022aa5466 100644 --- a/polly/test/ScopDetect/simple_non_single_entry.ll +++ b/polly/test/ScopDetect/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll index 2150a3e8c35dd..789942a950051 100644 --- a/polly/test/ScopDetect/skip_function_attribute.ll +++ b/polly/test/ScopDetect/skip_function_attribute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify polly skips this function ; diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll index 66c3b045f62a4..471602968055e 100644 --- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll +++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: ; diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll index a1dcebec63ff8..5d87599da29f7 100644 --- a/polly/test/ScopDetect/statistics.ll +++ b/polly/test/ScopDetect/statistics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -stats -disable-output < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll index 2f9b670384db2..1e825f4950afa 100644 --- a/polly/test/ScopDetect/switch-in-loop-patch.ll +++ b/polly/test/ScopDetect/switch-in-loop-patch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid diff --git a/polly/test/ScopDetect/tlr_is_hoistable_load.ll 
b/polly/test/ScopDetect/tlr_is_hoistable_load.ll index 5c33522f62325..24a3f55a519e2 100644 --- a/polly/test/ScopDetect/tlr_is_hoistable_load.ll +++ b/polly/test/ScopDetect/tlr_is_hoistable_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-detect-full-functions -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting -polly-detect-full-functions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s ; ; This testcase checks for compatibility of the -detect-full-functions ; flag in combination with the -invariant-load-hoisting option. More diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll index 4ae86a940e0c8..e7245d80b60ed 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ;void f(int A[], int B[]) { ; for (int i=0; i<42; i++) diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll index adb14b5b017d4..2a0b281073f59 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported). 
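The ScopDetectionDiagnostics tests keep their remark plumbing and only swap the pass spelling; the resulting pattern, with the CHECK line quoted from ReportEntry.ll above:

  ; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
  ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).

Two incidental quoting cleanups ride along in this patch: -pass-remarks-missed=polly-detect drops its unnecessary quotes (the value contains no shell metacharacters), while regex-valued flags such as '-polly-only-func=f.*,g.*' gain quotes, presumably because the * would otherwise be eligible for glob expansion by lit's shell.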
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll index 428a7cf855f6e..fc4c1fbcef484 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; #define N 1024 ; double invalidCall(double A[N]); diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll index 30e5fb9fdeba8..7a540d606eadf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ;void foo(int a, int b) { diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll index 2bc515e0ae5e1..512366f1bc7ce 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow. 
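ReportLoopBound-01.ll and ReportUnprofitable.ll below additionally fold RUN lines that were wrapped with backslash continuations into single physical lines. lit concatenates continued "; RUN:" lines before executing them, so the two forms run identically; schematically, a wrapped command such as

  ; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures \
  ; RUN:   '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

and its single-line equivalent differ only in how readable and greppable the test is.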
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll index a96b64e4e0d54..e844aea24ac26 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll @@ -1,16 +1,6 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \ -; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \ -; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly \ -; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \ -; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \ -; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-process-unprofitable=false \ -; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \ -; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-process-unprofitable=false -polly-detect-track-failures -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEALL ; void f(int A[], int n) { ; for (int i = 0; i < A[n+i]; i++) diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll index 6156efaea1909..d80911cc0ec9a 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll @@ -4,8 +4,8 @@ ; the PostDominatorTree. Infinite loops are postdominated only by the virtual ; root, which causes them not to appear in regions in ScopDetection anymore. 
-; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void func (int param0, int N, int *A) ; { diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll index dd95bd6ede715..d8c2916cc23bb 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN-ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect 
-polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE ; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n]) ; 2 { diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll index 13ac9d5ace2d3..ee0aa743f434b 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(int A[]) { ; for(int i=0; i<42; ++i) diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll index 93e9e8b14038b..ad2c813c4b7ce 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll @@ -1,10 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \ -; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \ -; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false -pass-remarks-output=%t.yaml < %s 2>&1 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll index d110cfefc27dd..d97032c8f8eaf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \ -; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll index 5f296fae9532b..7a5025c0c2fbe 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" 
-polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; struct b { ; double **b; diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll index 3cdeed13ec285..e15c045907ddf 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines ; ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c ; diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll index 4a9a200d67dfd..b5918d9f7a2d4 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll index 61ff033d9f934..502abf8dab6d7 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll index c5efec3f50c58..accb562771819 100644 --- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll +++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll @@ -1,5 +1,5 @@ ; This should be run without alias analysis enabled. 
-;RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" define i32 @main() nounwind { diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll index 81c7efb963652..57ae977a1a13f 100644 --- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll +++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" declare void @foo() diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll index 5abf8ff29ef85..3cb63cc4f952c 100644 --- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll +++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll index d16ba453f9815..668fcd8fabcaf 100644 --- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll +++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll index ebbe744627ef8..50c1b65727eaf 100644 --- a/polly/test/ScopInfo/Alias-0.ll +++ b/polly/test/ScopInfo/Alias-0.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll index b1711c25857d0..15fd6c936fc47 100644 --- 
a/polly/test/ScopInfo/Alias-1.ll +++ b/polly/test/ScopInfo/Alias-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll index b94f130c94ebd..598ad0fe8cf1c 100644 --- a/polly/test/ScopInfo/Alias-2.ll +++ b/polly/test/ScopInfo/Alias-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll index af7816546b4ab..388a2defec395 100644 --- a/polly/test/ScopInfo/Alias-3.ll +++ b/polly/test/ScopInfo/Alias-3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll index fe651c87b241c..e9f4f95a9997f 100644 --- a/polly/test/ScopInfo/Alias-4.ll +++ b/polly/test/ScopInfo/Alias-4.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly -aa-pipeline= 
'-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll index 0b69beaaf3f9c..d44c18cf49e36 100644 --- a/polly/test/ScopInfo/BoundChecks/single-loop.ll +++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST ; ; This only works after the post-dominator tree has been fixed. ; diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll index f2ba17d33c0ea..9034f75f13792 100644 --- a/polly/test/ScopInfo/BoundChecks/two-loops.ll +++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST ; ; This only works after the post-dominator tree has been fixed.
; XFAIL: * diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll index 3b0c673ece38b..e8edad9494075 100644 --- a/polly/test/ScopInfo/NonAffine/div_backedge.ll +++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(float *A) { ; for (long i = 1;; i++) { diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll index 34a5cecdfe3df..c195bb42dac9f 100644 --- a/polly/test/ScopInfo/NonAffine/div_domain.ll +++ b/polly/test/ScopInfo/NonAffine/div_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(float *A) { ; for (long i = 0; i < 16; i++) { diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll index 7d02fae7f98f3..31ecdaa0ef3e4 100644 --- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll +++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B, int *C) { ; for (int i = 0; i < 1000; i++) diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll index d5c808d9021f2..e0cd1e51a095c 100644 --- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll +++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 }; diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll index 13fe53f11633d..53bbe15799e61 100644 --- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll +++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: The new domain generation cannot handle modulo domain constraints, ; hence modulo handling has been disabled completely. 
Once this is diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll index 2b8427d74ec84..7d34ef9644b5a 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT ; ; SCALAR: Function: f ; SCALAR-NEXT: Region: %bb1---%bb13 diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll index 30f756e81e474..a40afdde1237f 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always model the diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll index 6dacd719862ef..f3678d3245f57 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always model the diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll index 8a13f791ed6de..85a1081159d59 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 128; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll index 1e70d2c9db87e..65513a5d9d1fb 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Statements { ; CHECK-NEXT: Stmt_for_body diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll index dcfaa9280dcb8..0185774d6274c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void pos(float *A, long n) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll index 24bfe60502163..ab47dc0b78260 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll index 931ad36d15f34..51a7d54562780 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll @@ -1,12 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=ALL +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches 
-polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Negative test for INNERMOST. ; At the moment we will optimistically assume A[i] in the conditional before the inner diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll index 37b51cebd74d5..b1f7e65e9dd25 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll @@ -1,16 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-process-unprofitable=false \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-process-unprofitable=false -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Negative test for INNERMOST. 
; At the moment we will optimistically assume A[i] in the conditional before the inner diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll index 7bfd7f86efcdb..ac77dfb7454d3 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(float *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll index fc779d544e62f..db08544aa559c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION ; ; void f(int *A, int *C) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll index 63ff354d7e5f7..cde2dc495d549 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | 
FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Verify that we over approximate the read access of A[j] in the last statement as j is ; computed in a non-affine loop we do not model. diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll index d33befe2c66e0..ce4cc6189d45c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long n, double A[], int INDEX[]) { diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll index 77c2df48d6514..b46ce87a45e2d 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll index 9ed340d1d304b..58e5ccd9b6e36 100644 --- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll +++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Regression test that triggered a memory leak at some point (24947). ; diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll index cbd024ba7a392..d94fc5f8a8823 100644 --- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll +++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no alias group because we either access A or B never both. 
; diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll index 3858d8a7bb1d6..df7f75dd8d95e 100644 --- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll +++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we create two alias groups since the minimal/maximal accesses ; depend on %b. diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll index 7baa3dce1f9db..0ebc39c0e5a78 100644 --- a/polly/test/ScopInfo/aliasing_dead_access.ll +++ b/polly/test/ScopInfo/aliasing_dead_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not create a SCoP if there is no statement executed. ; diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll index 7265aab22a490..8e5bab661e18c 100644 --- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll +++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=IGNORED +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FOUND +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s --check-prefix=IGNORED ; ; FOUND: Function: foo ; IGNORED-NOT: Function: foo diff --git a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll index c7592bcb09fcf..aec6ea0bf1441 100644 --- a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll +++ b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-analysis-computeout=0 -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8 -; RUN: opt %loadPolly -polly-analysis-computeout=0 -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7 +; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8 +; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7 ; ; Check that we allow this SCoP even though it has 10 parameters involved in possibly aliasing accesses. ; However, only 7 are involved in accesses through B, 8 through C and none in accesses through A. 
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll index d66a10bc511b1..a7dbe0baeae5d 100644 --- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll +++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll index 9943802ec8595..db54a1687b4d5 100644 --- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll index 900d5d40d96f5..0001b8adb41e1 100644 --- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll +++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s ; ; @test1 ; Make sure we generate the correct aliasing check for a fixed-size memset operation. 
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll index 70c3c56fb3112..93253b7e65d4a 100644 --- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll +++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -1,14 +1,9 @@ -; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \ -; RUN: -polly-allow-dereference-of-all-function-parameters \ -; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \ -; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC +; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC -; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \ -; RUN: -polly-allow-dereference-of-all-function-parameters \ -; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE ; SCOP: Function: hoge ; SCOP-NEXT: Region: %bb15---%bb37 diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll index bd14e3868d525..994d49e5b887f 100644 --- a/polly/test/ScopInfo/assume_gep_bounds.ll +++ b/polly/test/ScopInfo/assume_gep_bounds.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void foo(float A[][20][30], long n, long m, long p) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll index 7a8c1870abe25..be43be598bd3d 100644 --- a/polly/test/ScopInfo/assume_gep_bounds_2.ll +++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-precise-inbounds | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-inbounds < %s 2>&1 | FileCheck %s ; ; void foo(float A[restrict][20], float B[restrict][20], long n, long m, ; long p) { diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll index 01fc12cd7f108..cfd9008741c3a 100644 --- a/polly/test/ScopInfo/assume_gep_bounds_many.ll +++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing < %s 2>&1 | FileCheck %s ; CHECK: Assumed Context: ; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, 
n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d } diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll index 3fb7a1329c745..b3aa7686d3010 100644 --- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll +++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not introduce a parameter here that is actually not needed. ; diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll index 81fcade08f65a..01c6d52c30f76 100644 --- a/polly/test/ScopInfo/bool-addrec.ll +++ b/polly/test/ScopInfo/bool-addrec.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable < %s 2>&1 | FileCheck %s ; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) { ; CHECK-NEXT: if (c0 % 2 == 0) diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll index 5628092de7765..21ba391f4fc1a 100644 --- a/polly/test/ScopInfo/bounded_loop_assumptions.ll +++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context is tricky here as the equality test for the inner loop ; allows an "unbounded" loop trip count.
We assume that does not happen, thus diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll index 83743e4e4ecc7..d25a8e666b525 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=SCOP +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOP ; DETECT: Valid Region for Scop: loop => barrier ; DETECT-NEXT: Valid Region for Scop: branch => end diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll index 9685ba37a49a1..91aa96e0f3501 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE ; NONAFFINE: Statements { ; NONAFFINE-NEXT: Stmt_loop diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll index f41e6500fb30a..22a60c764eb4d 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE ; NONAFFINE-NOT: Statements diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll index 71e7051922b53..1d248891dfd09 100644 
--- a/polly/test/ScopInfo/bug_2010_10_22.ll +++ b/polly/test/ScopInfo/bug_2010_10_22.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll index f4a24e06f46ae..7c76c3eaa565a 100644 --- a/polly/test/ScopInfo/bug_2011_1_5.ll +++ b/polly/test/ScopInfo/bug_2011_1_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass. ; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks. diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll index ed6bbafdac1f0..6e1ef2339a81d 100644 --- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll +++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | not FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1] diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll index 9161d3db4167a..2b702e235ca6c 100644 --- a/polly/test/ScopInfo/cfg_consequences.ll +++ b/polly/test/ScopInfo/cfg_consequences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void consequences(int *A, int bool_cond, int lhs, int rhs) { ; diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll index de79c2226e68d..f48089afb93b9 100644 --- a/polly/test/ScopInfo/complex-branch-structure.ll +++ b/polly/test/ScopInfo/complex-branch-structure.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; We build a scop of the following form to check that the domain construction ; does not take a huge amount of time, but that we instead just bail out. 
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll index c3b8d2bb0ef88..9164959c1f6dc 100644 --- a/polly/test/ScopInfo/complex-condition.ll +++ b/polly/test/ScopInfo/complex-condition.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll index 4a2a1d2a64a6d..456edb04e0c2b 100644 --- a/polly/test/ScopInfo/complex-expression.ll +++ b/polly/test/ScopInfo/complex-expression.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; This test case has an SCEVSMax expression with a very high arity. The ; piecewise affine function we would create for it would have a huge amount of diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll index 36cb078f19fff..4ffd8689f1a4a 100644 --- a/polly/test/ScopInfo/complex-loop-nesting.ll +++ b/polly/test/ScopInfo/complex-loop-nesting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll index f4a78bf753853..32425d7598bc9 100644 --- a/polly/test/ScopInfo/complex-successor-structure-2.ll +++ b/polly/test/ScopInfo/complex-successor-structure-2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; We build a scop for the region for.body->B13. The CFG is of the following ; form and the branch conditions are build from "smax" SCEVs. 
However, in diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll index 6da1fe3a8b9f3..c01eca534bcf1 100644 --- a/polly/test/ScopInfo/complex-successor-structure-3.ll +++ b/polly/test/ScopInfo/complex-successor-structure-3.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; Check that propagation of domains from A(X) to A(X+1) will keep the ; domains small and concise. diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll index 6c87ba3e98505..1b39f4cf192eb 100644 --- a/polly/test/ScopInfo/complex-successor-structure.ll +++ b/polly/test/ScopInfo/complex-successor-structure.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; We build a scop from the region for.body->B13. The CFG is of the ; following form. The test checks that the condition construction does not take diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll index 6e28c9dfee06a..42a114eaa6ec1 100644 --- a/polly/test/ScopInfo/complex_domain_binary_condition.ll +++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll index 9880a1dd67d19..9896fba8904b8 100644 --- a/polly/test/ScopInfo/complex_execution_context.ll +++ b/polly/test/ScopInfo/complex_execution_context.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: ; diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll index 552fddc6ff08c..ecc2767fd6ecd 100644 --- a/polly/test/ScopInfo/cond_constant_in_loop.ll +++ b/polly/test/ScopInfo/cond_constant_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll index 
c06dcd955bac1..0f31904133719 100644 --- a/polly/test/ScopInfo/cond_in_loop.ll +++ b/polly/test/ScopInfo/cond_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll index 8c4b2170ad69b..257b2ede236d9 100644 --- a/polly/test/ScopInfo/condition-after-error-block-2.ll +++ b/polly/test/ScopInfo/condition-after-error-block-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that we do not allow PHI nodes such as %phi, if they reference an error ; block and are used by anything else than a terminator instruction. diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll index d5069da916fa1..d86b48ed24963 100644 --- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll +++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll index d9de4fc40a208..8ad98b4a4a78e 100644 --- a/polly/test/ScopInfo/condtion-after-error-block.ll +++ b/polly/test/ScopInfo/condtion-after-error-block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that we allow scops containing uniform branch conditions, where all ; but one incoming block comes from an error condition. 
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll index b4c2f119fe053..b50c4bd910dda 100644 --- a/polly/test/ScopInfo/const_srem_sdiv.ll +++ b/polly/test/ScopInfo/const_srem_sdiv.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf ; diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll index 86dd94e3371b2..f09f82f32c93a 100644 --- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll +++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; At some point this caused a problem in the domain generation as we ; assumed any constant branch condition to be valid. However, only constant diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll index b58d413e074e7..26c73bd72271b 100644 --- a/polly/test/ScopInfo/constant_factor_in_parameter.ll +++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s ; ; Check that the constant part of the N * M * 4 expression is not part of the ; parameter but explicit in the access function. 
This can avoid existentially diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll index 62e6cd4641de1..762132f9edd78 100644 --- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll +++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll index 8991f8250f0b7..6d17288b28227 100644 --- a/polly/test/ScopInfo/constant_start_integer.ll +++ b/polly/test/ScopInfo/constant_start_integer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(float *input) { diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll index a6761ecebe6a7..63c1baca5accc 100644 --- a/polly/test/ScopInfo/debug_call.ll +++ b/polly/test/ScopInfo/debug_call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the call to dbg_printf is accepted as a debug-function. 
; diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll index 676c8a27e5749..7126fb95cd00c 100644 --- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll +++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void foo(long n, long m, long o, double A[n][m][o]) { ; for (long i = 0; i < n-3; i++) diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll index aecd16833b84e..62a13de7ceac0 100644 --- a/polly/test/ScopInfo/div_by_zero.ll +++ b/polly/test/ScopInfo/div_by_zero.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll index a3ca59563ab1f..333175b417ade 100644 --- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll +++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; Check that we do not crash on this input. Earlier this indeed crashed as ; we tried to model the access functions in an error block. diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll index a988b3f8c2b01..b111851939d06 100644 --- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll +++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll index 9a1edcbfb7796..3ee6ff7889c84 100644 --- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll +++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check we do not crash. 
; diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll index 047b095a95947..902ea15752980 100644 --- a/polly/test/ScopInfo/error-blocks-1.ll +++ b/polly/test/ScopInfo/error-blocks-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 } diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll index 6fa12947540c0..613b00a1a9ba7 100644 --- a/polly/test/ScopInfo/error-blocks-2.ll +++ b/polly/test/ScopInfo/error-blocks-2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/error-blocks-3.ll b/polly/test/ScopInfo/error-blocks-3.ll index e7643601356db..9521037888075 100644 --- a/polly/test/ScopInfo/error-blocks-3.ll +++ b/polly/test/ScopInfo/error-blocks-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; The instruction ; diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll index 2efaef3fb99b8..d47b2865b4ee0 100644 --- a/polly/test/ScopInfo/escaping_empty_scop.ll +++ b/polly/test/ScopInfo/escaping_empty_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void g(); ; int f(int *A) { diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll index cbd6c280e8caa..21f13cf4f4e4d 100644 --- a/polly/test/ScopInfo/exit-phi-1.ll +++ b/polly/test/ScopInfo/exit-phi-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; Check for correct code generation of exit PHIs, even if the same PHI value ; is used again inside the the SCoP. 
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll index 695c617b14c1f..b8da9ab5b64f9 100644 --- a/polly/test/ScopInfo/exit-phi-2.ll +++ b/polly/test/ScopInfo/exit-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no MK_ExitPHI READ access. ; diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll index b3b7cb1c65993..928b564c7cef5 100644 --- a/polly/test/ScopInfo/exit_phi_accesses-2.ll +++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK-LABEL: Function: foo ; diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll index 77b038ec8e4af..a54ca4a185ae2 100644 --- a/polly/test/ScopInfo/exit_phi_accesses.ll +++ b/polly/test/ScopInfo/exit_phi_accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary ; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll index 95212f83acdca..c0d2dcd16289d 100644 --- a/polly/test/ScopInfo/expensive-boundary-context.ll +++ b/polly/test/ScopInfo/expensive-boundary-context.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Assumed Context: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll index 5e833e7ae0f4f..2f446b630168a 100644 --- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll +++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; CHECK: Valid Region for Scop: bb10 => bb16 diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll index 596c3d0af66a9..20cb137181697 100644 --- a/polly/test/ScopInfo/full-function.ll +++ b/polly/test/ScopInfo/full-function.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 
\ -; RUN: | FileCheck %s -check-prefix=FULL -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s -check-prefix=FULL +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=WITHOUT-FULL ; FULL: Region: %bb---FunctionExit ; FULL: Statements { diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll index 17f75fbf8a979..638b09879ce39 100644 --- a/polly/test/ScopInfo/granularity_same_name.ll +++ b/polly/test/ScopInfo/granularity_same_name.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB ; ; Check that the statement has the same name, regardless of how the ; basic block is split into multiple statements. diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll index 5c4484f9d4579..f4d864d2c6543 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. 
; This case has the instructions of the two statements interleaved, such that diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll index 7ae0d961b38fb..f2c37f6293d62 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll index 7839e51c163ae..f7bd882da96e2 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll index 8643e85e05593..80aa9fb6deb7c 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. ; This case has an independent statement just for PHI writes. 
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll index bc71cbe45cd98..66ef9fa9429e9 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the PHI Write of value that is defined in the same basic ; block is in the statement where it is defined. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll index f3864bac519b9..3837219e5d818 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case has no explicit epilogue for PHI writes because it would ; have a scalar dependency to the previous statement. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll index 43101a8a0abfc..c43ad76d079d8 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case should be split into two statements because {X[0], Y[0]} ; and {A[0], B[0]} do not intersect. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll index 4974f7e9b28ca..cfa7739d743f7 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case cannot be split into two statements because the order of ; loads and store would be violated. 
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll index be3e287372017..cf5b533c02682 100644 --- a/polly/test/ScopInfo/i1_params.ll +++ b/polly/test/ScopInfo/i1_params.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that both a signed as well as an unsigned extended i1 parameter ; is represented correctly. diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll index 7a0bfe0fa4d84..9221ddf5fc910 100644 --- a/polly/test/ScopInfo/infeasible-rtc.ll +++ b/polly/test/ScopInfo/infeasible-rtc.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll index 006901ab05b79..7ab6477460721 100644 --- a/polly/test/ScopInfo/infeasible_invalid_context.ll +++ b/polly/test/ScopInfo/infeasible_invalid_context.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS ; DETECT: Valid Region for Scop: if.end116 => for.inc216 ; SCOPS-NOT: Statements diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll index 578015aeecdc5..adefe794561c2 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll index 627524c0327dd..a88fcdc0f9b12 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | 
FileCheck %s -; RUN: opt %loadNPMPolly -S -passes=polly-codegen \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *B, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll index 4f6d1117e2bcc..5f89243be0e3b 100644 --- a/polly/test/ScopInfo/integers.ll +++ b/polly/test/ScopInfo/integers.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Check that we correctly convert integers to isl values. diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll index 761fcbbe3435e..0829f34be9791 100644 --- a/polly/test/ScopInfo/inter-error-bb-dependence.ll +++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s ; ; Error statements (%bb33) do not require their uses to be verified. ; In this case it uses %tmp32 from %bb31 which is not available because diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll index 7313618b082bc..f6406640dd2d8 100644 --- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll index d2ed3c17fe9dd..3150204cd9549 100644 --- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll +++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_loop__TO__backedge diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll index b3286cd2a7240..b0b63658caa55 100644 --- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa 
'-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll index 86855e7499a51..0ef6b2d35106b 100644 --- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll index e6d9e733e35bf..e17d06f753a21 100644 --- a/polly/test/ScopInfo/intrinsics.ll +++ b/polly/test/ScopInfo/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we remove the ignored intrinsics from the instruction list. ; diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll index 723942668d8c2..d3439d8d33662 100644 --- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll +++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; This crashed at some point as we place %1 and %4 in the same equivalence class ; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll index c493c22af32d9..ff5b0f601d03f 100644 --- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll +++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that no invalidated iterator is accessed while elements from ; the list of MemoryAccesses are removed. diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll index ecb80e4054c35..1ec36e6d9d1b9 100644 --- a/polly/test/ScopInfo/invariant-load-instlist.ll +++ b/polly/test/ScopInfo/invariant-load-instlist.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; The load is a required invariant load and at the same time used in a store. ; Polly used to add two MemoryAccesses for it which caused an assertion to fail. 
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll index 89eac6ce69a11..2d14287d4df44 100644 --- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll +++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_L_4 diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll index 9dc064276c40f..8974b7f7fb8cb 100644 --- a/polly/test/ScopInfo/invariant_load.ll +++ b/polly/test/ScopInfo/invariant_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll index 40aa3098683b3..7b5a7591813a6 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll index 287676024079c..0c2f57dfcb1c3 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git 
a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll index cb745b4920b82..865bd789db6fb 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; void f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll index fa5429d4803a8..f63fe9cc1f7c6 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; int f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll index 2e639f7d5e331..e70aa80ae6009 100644 --- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll +++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Region: %entry.split---%if.end ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll index f2539af97a0b7..1176d1ca9db85 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: 
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll index f854b1f48ea92..81fd3b9559f43 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll index 5a9c5c6cabbe6..7313176aceed7 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll index d12750c30ba98..f6cadffe311e8 100644 --- a/polly/test/ScopInfo/invariant_load_branch_condition.ll +++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll index 34d50a18663c4..76cc55767caca 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Stmt_body1 ; CHECK-NEXT: Domain := diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll index 51f3cf6c095ac..9cc9391b6bc25 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; 
RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Make sure we choose a canonical element that is not the first invariant load, ; but the first that is an array base pointer. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll index 3a742bbccdf19..7f609f9a54689 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that we canonicalize accesses even tough one of the accesses (even ; the canonical base) has a partial execution context. This is correct as diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll index 6bd8b3146e871..216e0760987cd 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that a delinearized and a not delinearized access are not ; canonicalized. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll index cb7e5646fc2b0..5da3d0ceb2d0f 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that two arrays delinearized with different sizes are not coalesced. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll index 6f7fbacc089cb..b71a092a2d468 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that arrays with different element types are not coalesced. 
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
index 445832822bdf0..2c4683ea5ce96 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
-; RUN: -polly-invariant-load-hoisting \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s
 
 ; Verify that nested arrays with invariant base pointers are handled correctly.
 ; Specifically, we currently do not canonicalize arrays where some accesses are
diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll
index 11e7088d68dbd..e6ea032004a96 100644
--- a/polly/test/ScopInfo/invariant_load_complex_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll
index c7d7b3c9ba611..8b1dc8be87c86 100644
--- a/polly/test/ScopInfo/invariant_load_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
index 526bdc6ddb3bd..fc5527c48c411 100644
--- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll
+++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect;scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Function: foo_undereferanceable
 
diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
index eb148063320e7..b5525a8e2639e 100644
--- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
+++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not consolidate the invariant loads to smp[order - 1] and
 ; smp[order - 2] in the blocks %0 and %16. While they have the same pointer
diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
index 5261113f5a0cf..69a7932fd3f58 100644
--- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll
+++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop
 ;
diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll
index ee889e6c4d5a1..9258d75f6e294 100644
--- a/polly/test/ScopInfo/invariant_load_loop_ub.ll
+++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
index 6af7caecc0b37..50b0103b73efb 100644
--- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Note: The order of the invariant accesses is important because A is the
 ; base pointer of tmp3 and we will generate code in the same order as
diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
index 319f24bdcb920..ae1423e1e5f05 100644
--- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll
+++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
index 715948062c055..8062d875b1174 100644
--- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll
+++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; This test case verifies that the statement domain of the invariant access
 ; is the universe. In earlier versions of Polly, we accidentally computed an
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
index a6108320d5608..9ee4a54168a68 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Stress test for the code generation of invariant accesses.
 ;
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
index e3c183aab5e26..5bd2c51d86fa6 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
 ;
 ; void f(int *I0, int *I1, int *V) {
 ;   for (int i = 0; i < 1000; i++) {
diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
index b5168e912ed74..426c14c191dd1 100644
--- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
+++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : }
 ;
diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
index 85360821078dc..77f74df7d7b21 100644
--- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
index 134eac22bff5c..f18534d5bee24 100644
--- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Negative test. If we assume UB[*V] to be invariant we get a cyclic
 ; dependence in the invariant loads that needs to be resolved by
diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll
index f22199cfe4942..dcf7f50eb27c4 100644
--- a/polly/test/ScopInfo/invariant_loop_bounds.ll
+++ b/polly/test/ScopInfo/invariant_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
index e3292b4e4aefa..df5798638ba7c 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we only have one parameter and one invariant load for all
 ; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
index d69438de5817f..3d8c232c75970 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we only have one parameter and one invariant load for all
 ; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
index 2df96faf76249..965531f20b01d 100644
--- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
+++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s 2>&1
 
 ; Used to fail with:
 ; ../../isl/isl_aff.c:591: position out of bounds
diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll
index 480b6e9574a66..79621ce64bbcc 100644
--- a/polly/test/ScopInfo/isl_trip_count_01.ll
+++ b/polly/test/ScopInfo/isl_trip_count_01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] };
 ;
diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll
index b78fb838edd0f..3052299277844 100644
--- a/polly/test/ScopInfo/isl_trip_count_02.ll
+++ b/polly/test/ScopInfo/isl_trip_count_02.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; TODO: We do not allow unbounded loops at the moment.
 ;
diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll
index 96df05f89bcff..52fde263d6898 100644
--- a/polly/test/ScopInfo/isl_trip_count_03.ll
+++ b/polly/test/ScopInfo/isl_trip_count_03.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Test comes from a bug (15771), or rather a feature request. It was not allowed
 ; in Polly in the old domain generation as ScalarEvolution cannot figure out the
diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
index fd310ececaa38..657b8f6dc64e1 100644
--- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/licm_load.ll b/polly/test/ScopInfo/licm_load.ll
index ade640976d007..8f1cf4fa8fd91 100644
--- a/polly/test/ScopInfo/licm_load.ll
+++ b/polly/test/ScopInfo/licm_load.ll
@@ -1,7 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='loop(loop-rotate,indvars),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
-; RUN: opt %loadNPMPolly -passes='loop-mssa(loop-rotate,indvars,licm),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(int n, float A[static const restrict n],
 ;          float B[static const restrict n], int j) {
@@ -14,26 +11,30 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define void @foo(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %j) {
 entry:
   %tmp = sext i32 %n to i64
-  br label %for.cond
+  %cmp1 = icmp slt i64 0, %tmp
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
 
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %cmp = icmp slt i64 %indvars.iv, %tmp
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
+for.body.lr.ph:                                   ; preds = %entry
   %idxprom = sext i32 %j to i64
   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
   %tmp2 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %indvars.iv2 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv2
   store i32 %tmp2, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %tmp
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.inc
+  br label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
   ret void
 }
diff --git a/polly/test/ScopInfo/licm_potential_store.ll b/polly/test/ScopInfo/licm_potential_store.ll
index 8a36ee84313a2..cbd8e410ed7c8 100644
--- a/polly/test/ScopInfo/licm_potential_store.ll
+++ b/polly/test/ScopInfo/licm_potential_store.ll
@@ -1,10 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,polly-prepare,print<polly-function-scops>' \
-; RUN: -tailcallopt -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefix=NOLICM
-
-; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,loop-mssa(licm),polly-prepare,print<polly-function-scops>' \
-; RUN: -tailcallopt -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefix=LICM
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NOLICM
 
 ; void foo(int n, float A[static const restrict n], float x) {
 ;   // (0)
@@ -17,67 +11,40 @@
 ;   // (4)
 ; }
 
-; LICM: Statements
 ; NOLICM: Statements
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define void @foo(i32 %n, ptr noalias nonnull %A, float %x) {
 entry:
-  %n.addr = alloca i32, align 4
-  %A.addr = alloca ptr, align 8
-  %x.addr = alloca float, align 4
-  %i = alloca i32, align 4
-  %j = alloca i32, align 4
-  store i32 %n, ptr %n.addr, align 4
-  store ptr %A, ptr %A.addr, align 8
-  store float %x, ptr %x.addr, align 4
-  %tmp = load i32, ptr %n.addr, align 4
-  %tmp1 = zext i32 %tmp to i64
-  store i32 0, ptr %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc.4, %entry
-  %tmp2 = load i32, ptr %i, align 4
-  %cmp = icmp slt i32 %tmp2, 5
-  br i1 %cmp, label %for.body, label %for.end.6
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %0 = add nuw i32 %smax, 1
+  br label %for.cond.1.preheader
 
-for.body:                                         ; preds = %for.cond
-  store i32 0, ptr %j, align 4
+for.cond.1.preheader:                             ; preds = %entry, %for.end
+  %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ]
+  %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ]
   br label %for.cond.1
 
-for.cond.1:                                       ; preds = %for.inc, %for.body
-  %tmp3 = load i32, ptr %j, align 4
-  %tmp4 = load i32, ptr %n.addr, align 4
-  %cmp2 = icmp slt i32 %tmp3, %tmp4
-  br i1 %cmp2, label %for.body.3, label %for.end
-
-for.body.3:                                       ; preds = %for.cond.1
-  store float 7.000000e+00, ptr %x.addr, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body.3
-  %tmp5 = load i32, ptr %j, align 4
-  %add = add nsw i32 %tmp5, 1
-  store i32 %add, ptr %j, align 4
-  br label %for.cond.1
+for.cond.1:                                       ; preds = %for.cond.1, %for.cond.1.preheader
+  %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ]
+  %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ]
+  %add = add nuw i32 %j.0, 1
+  %exitcond = icmp ne i32 %add, %0
+  br i1 %exitcond, label %for.cond.1, label %for.end
 
 for.end:                                          ; preds = %for.cond.1
-  %tmp6 = load float, ptr %x.addr, align 4
-  %tmp7 = load ptr, ptr %A.addr, align 8
-  store float %tmp6, ptr %tmp7, align 4
-  br label %for.inc.4
-
-for.inc.4:                                        ; preds = %for.end
-  %tmp8 = load i32, ptr %i, align 4
-  %add5 = add nsw i32 %tmp8, 1
-  store i32 %add5, ptr %i, align 4
-  br label %for.cond
+  %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ]
+  store float %x.addr.1.lcssa, ptr %A, align 4
+  %add5 = add nuw nsw i32 %i.05, 1
+  %exitcond6 = icmp ne i32 %add5, 5
+  br i1 %exitcond6, label %for.cond.1.preheader, label %for.end.6
 
-for.end.6:                                        ; preds = %for.cond
+for.end.6:                                        ; preds = %for.end
   ret void
 }
 
-; CHECK: Statements {
-; CHECK:     Stmt_for_end
-; CHECK: }
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smax.i32(i32, i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/polly/test/ScopInfo/licm_potential_store_mssa.ll b/polly/test/ScopInfo/licm_potential_store_mssa.ll
new file mode 100644
index 0000000000000..ce785d622fcb3
--- /dev/null
+++ b/polly/test/ScopInfo/licm_potential_store_mssa.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LICM
+
+; void foo(int n, float A[static const restrict n], float x) {
+;   // (0)
+;   for (int i = 0; i < 5; i += 1) {
+;     for (int j = 0; j < n; j += 1) {
+;       x = 7; // (1)
+;     }
+;     A[0] = x; // (3)
+;   }
+;   // (4)
+; }
+
+; LICM: Statements
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i32 %n, ptr noalias nonnull %A, float %x) {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  br label %for.cond.1.preheader
+
+for.cond.1.preheader:                             ; preds = %for.end, %entry
+  %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ]
+  %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ]
+  br label %for.cond.1
+
+for.cond.1:                                       ; preds = %for.cond.1, %for.cond.1.preheader
+  %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ]
+  %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ]
+  %add = add nuw i32 %j.0, 1
+  %exitcond.not = icmp eq i32 %j.0, %smax
+  br i1 %exitcond.not, label %for.end, label %for.cond.1
+
+for.end:                                          ; preds = %for.cond.1
+  %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ]
+  %add5 = add nuw nsw i32 %i.05, 1
+  %exitcond6.not = icmp eq i32 %add5, 5
+  br i1 %exitcond6.not, label %for.end.6, label %for.cond.1.preheader
+
+for.end.6:                                        ; preds = %for.end
+  %x.addr.1.lcssa.lcssa = phi float [ %x.addr.1.lcssa, %for.end ]
+  store float %x.addr.1.lcssa.lcssa, ptr %A, align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smax.i32(i32, i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll
index c1676033fa909..50625b2ddabde 100644
--- a/polly/test/ScopInfo/licm_reduction_nested.ll
+++ b/polly/test/ScopInfo/licm_reduction_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; XFAIL: *
 ;
diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
index f102518da5261..8225bd04fce63 100644
--- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
+++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 
 ; Verify that the compilation of this test case does not take infinite time.
 ; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index e32748a4bbb57..064a0d3e700b9 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index b32b87b5c3f3a..edaadd61dc020 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 431c907857fec..391f0ec8c0f59 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=IR
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index 918d4099740ce..fcd56613fc095 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ;   long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 8f7a87f1c5ac4..392509871a9b7 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ;void f(long a[][128], long N, long M) {
 ;  long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index 2d9f997a0767f..665dc1ad244d9 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ;   long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index 20ebbfbc8b49c..579f43d874577 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index 5b003325ef0fb..ddad36065a5c8 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float a[100][100]) {
 ;   float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index 71e9c02aa8dcc..f33933210247d 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void bar();
 ; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 6c45b0d41b76b..149a2fcfea772 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 ;
 ; Ensure that ScopInfo's alias analysis for llvm.memcpy,
 ; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 95c455f097b21..6b7a9e2edffbe 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index 8ff471a11cd17..aba886b59d1d5 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index 89b0487728210..7eaec7bd1ad6a 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 9755cf1129e68..7bd3e90b3aa82 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S '-passes=polly<no-default-opts>' < %s
 ;
 ; Verify we can handle a memset to "null" and that we do not model it.
 ; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index f825cbff1ec56..cd12421344f7f 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: AssumedContext
 
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 6bc5f8d8eb73f..1e289425e86d7 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
-; RUN: -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-allow-modref-calls -disable-output < %s
 ;
 ; Verify that we model the may-write access of the prefetch intrinsic
 ; correctly, i.e., that A is accessed by it but B is not.
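For context on the prefetch test just above, a minimal sketch of the call shape being modeled; the operand meanings follow the LLVM LangRef, and the value name %arrayidx is illustrative rather than taken from the test:

; llvm.prefetch takes (address, rw, locality, cache-type); rw = 1 selects a
; write prefetch, which is why the call must be modeled as a may-write of the
; array behind its pointer operand, while unrelated arrays stay untouched.
declare void @llvm.prefetch.p0(ptr, i32, i32, i32)
;
; Inside the statement, with %arrayidx pointing into A:
;   call void @llvm.prefetch.p0(ptr %arrayidx, i32 1, i32 3, i32 1)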
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 21322bc648f8e..0b6e64da437fd 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
-; RUN: -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
 ;
 ; Verify that we model the read access of the gcread intrinsic
 ; correctly, i.e., that A is read by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 25e56a08a961b..25d59d9f7fd16 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly<no-default-opts>' -disable-output < %s
 ;
 ; Check that we assume the call to func has a read on the whole A array.
 ;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index 5cc96cf3a06eb..f8cbb084aefe8 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
-; RUN: -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
 ;
 ; Check that the call to func will "read" not only the A array but also the
 ; B array. The reason is the readonly annotation of func.
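A minimal sketch of why a readonly callee widens the read set as described above; the declaration is hypothetical, and memory(read) is the current spelling of the annotation the comment refers to:

; A callee that may read arbitrary memory forces a conservative model: the
; call is treated as a read of every array visible in the SCoP, not only of
; the array actually passed as an argument.
declare void @func(ptr) memory(read)
;
;   call void @func(ptr %A)   ; modeled as a read of MemRef_A and MemRef_B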
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index 0a8957da4931a..a9b53d53aea7e 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index 7af2411e7e8c4..f86ddcea9fe2b 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index 1dac723aa2c23..21596d16a6e14 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index c6dc1f201efa2..8647d89c91d7a 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index bd46532d87f10..8785458e42f2c 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index cdd46304c932b..5de07bad6bd06 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index 0b735b9106189..984f41cd1e9bf 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index befca87972c19..96b822ad4aa86 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index cceb5353d74c0..c04cc200e06bd 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index c957dd10ed652..2abd37c9f82d0 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index 4a1ee3b1af51d..47cbc0bb1c534 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define N 400
 ;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index 9a6d8fbe12755..e82869616d63c 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index 9f47694022868..dde847bb8d4d7 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; struct com {
 ;   double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 5778126ad8f17..84222f73b7c6d 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index e5b005f17dcc7..10314606a8123 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index a7f7ebc130362..992df969f9cc2 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-precise-fold-accesses \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-fold-accesses -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index 5f3080a12fdbe..79fd4c286745e 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 31cc633fa65c6..62ff184f7a6b6 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index fd8048b11f14b..aa7932fb737f0 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The load access to A has a pointer-bitcast to another element size before the
 ; GetElementPtr. Verify that we do not use the GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index 9daae4b1ce3db..0475506fa9f1a 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not use the GetElementPtr information to delinearize A
 ; because of the cast in-between. Use the single-dimensional modeling instead.
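As background for the two pointercast tests above, a sketch of the difference between the two models; the array bound 128 is made up for illustration:

; With an unobstructed GEP chain, an access like A[i][j] keeps its shape and
; can be modeled multi-dimensionally:
;   %gep = getelementptr inbounds [128 x float], ptr %A, i64 %i, i64 %j
;   ; modeled as MemRef_A[i][j]
; A cast to a different element size between the base pointer and the GEP
; hides this structure, so only the flattened single-dimensional form
; MemRef_A[128 * i + j] (counted in the new element size) can be modeled.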
diff --git a/polly/test/ScopInfo/multidim_invalid_dimension.ll b/polly/test/ScopInfo/multidim_invalid_dimension.ll
index e1ec2e1ce3be0..1cf79f1bd8de1 100644
--- a/polly/test/ScopInfo/multidim_invalid_dimension.ll
+++ b/polly/test/ScopInfo/multidim_invalid_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 92b42a9e7a870..7779748c8c7f6 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 261cba1e68aad..49e0a9b60657b 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index f0f1c2b1f39db..a4edc9e725ac4 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index 6ee9798a050d7..c98aece41a9e1 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index e238bddf4783b..12c8d97f5d63b 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index 33b321716edc3..a9685d12eb178 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index 39ea4243d9426..bb9c302eaf06a 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 7f7f7f91067e2..7f0c8b12be9ba 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(int n, int m, int o, double A[n][m][o]) {
 ;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 1675110ffd6f1..797a037a6770e 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index da9827fd5f2c6..3a21702b36727 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(long n, long m, float A[][n][m]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index c86b5f0ae2386..cc3fa87c8ba04 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ;
 ; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index da563a05560cd..117671ddc6a22 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, long *p) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 7059e5396987b..5ebe0daaec470 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false
-polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll index 88c8c6af648e0..5c1b0ea7e6150 100644 --- a/polly/test/ScopInfo/multidim_srem.ll +++ b/polly/test/ScopInfo/multidim_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, float A[][n][n]) { ; for (long i = 0; i < 200; i++) diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll index 0ab9c2d93ff46..941ec637dba3d 100644 --- a/polly/test/ScopInfo/multidim_with_bitcast.ll +++ b/polly/test/ScopInfo/multidim_with_bitcast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll index 65416e6fffda3..ecfc0012fd59f 100644 --- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll +++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff 
--git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll index 910e624adb50a..9ae664fd497c8 100644 --- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll +++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // For the following accesses the offset expression from the base pointer ; // is not always a multiple of the type size. diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll index cb0630da1b2e6..6530dbf8d75be 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll index 7349c5ae48ba2..7f5f995fd6d26 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. 
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll index df280c88f8668..5890a5a2ea3bf 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll index b9494187d0ff3..3e8390aad300f 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll index e971ccc0ba448..4e71f9b5dd66b 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; ; void foo(long n, long m, char A[][m]) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll index 34179508cae89..9899fe4bde7ed 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, long m, char A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll index 84d7d3349e29d..753386575d33a 100644 --- a/polly/test/ScopInfo/multiple-types.ll +++ b/polly/test/ScopInfo/multiple-types.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll index b0c425ee62cc4..218e5c4108c90 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll index ff0ec47be1c58..d3a70fdb96130 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll index e5085daa2ca16..0aa25f4ad70f6 100644 --- a/polly/test/ScopInfo/multiple_latch_blocks.ll +++ b/polly/test/ScopInfo/multiple_latch_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) }; diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll index 91002979f4fa4..7998a3896d9d4 100644 --- a/polly/test/ScopInfo/nested-loops.ll +++ b/polly/test/ScopInfo/nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll index df010846bed20..f1ad40baf33ea 100644 --- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll +++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not generate any scalar dependences regarding x. 
It is ; defined and used in the non-affine subregion only, thus we do not need diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll index 3fb655e60f1c0..0248004c27f50 100644 --- a/polly/test/ScopInfo/non-affine-region-phi.ll +++ b/polly/test/ScopInfo/non-affine-region-phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -S < %s 2>&1 | FileCheck %s --check-prefix=CODE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify there is a phi in the non-affine region but it is not represented in ; the SCoP as all operands as well as the uses are inside the region too. diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll index 4c3ca4d21447d..158fe772c6d29 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Stmt_loop3 ; CHECK: Domain := diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll index f4c028ac23409..bcb542f2cbf70 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly<no-default-opts>' -disable-output ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_loop2__TO__loop[] }; diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll index d55344b355f13..d100b514a0be3 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-1.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do hoist the invariant access to I with an execution context ; as the address computation might wrap in the original but not in our diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll index 79ef3b88cb4f0..fad8fcd918446 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-2.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll @@ -1,4 +1,4 @@ -; RUN:
opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll index aa92847661165..d032644c9e5ff 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-3.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll index 2a2241cb5a993..c1ba7ddc62584 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-4.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we hoist I[0] without execution context even though it ; is executed in a statement with an invalid domain. diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll index a414c7c0fed17..c188b5f74b1e9 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-5.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do not hoist I[c] without execution context because it ; is executed in a statement with an invalid domain and it depends diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll index 1300617f00eeb..b1c19745f1424 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-6.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we model the execution context correctly. 
; diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll index 81d43db5c3522..ad69141a12c66 100644 --- a/polly/test/ScopInfo/non-pure-function-call.ll +++ b/polly/test/ScopInfo/non-pure-function-call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [N] -> { : } diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll index 6cbb41041be88..38e1c03a35227 100644 --- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll +++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Error blocks are skipped during SCoP detection. We skip them during ; SCoP formation too as they might contain instructions we can not handle. diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll index f97644052272d..d45c32ede7088 100644 --- a/polly/test/ScopInfo/non-pure-function-calls.ll +++ b/polly/test/ScopInfo/non-pure-function-calls.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Allow the user to define function names that are treated as ; error functions and assumed not to be executed. diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll index 0338edf053297..0f5d9e7c43e4e 100644 --- a/polly/test/ScopInfo/non_affine_access.ll +++ b/polly/test/ScopInfo/non_affine_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long *A) { diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll index 8980a711b325d..5934962f81567 100644 --- a/polly/test/ScopInfo/non_affine_region_1.ll +++ b/polly/test/ScopInfo/non_affine_region_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify only the incoming scalar x is modeled as a read in the non-affine ; region. 
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll index b2e072f7a3bfa..aa083616cac8e 100644 --- a/polly/test/ScopInfo/non_affine_region_2.ll +++ b/polly/test/ScopInfo/non_affine_region_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll index d850cb5c95aad..b7c4c1b9bd545 100644 --- a/polly/test/ScopInfo/non_affine_region_3.ll +++ b/polly/test/ScopInfo/non_affine_region_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll index c5309734a668e..12cda0a53fb3b 100644 --- a/polly/test/ScopInfo/non_affine_region_4.ll +++ b/polly/test/ScopInfo/non_affine_region_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that both scalars (x and y) are properly written in the non-affine ; region and read afterwards. 
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll index b1ce00f0df94e..a52aae0d59168 100644 --- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll +++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] }; diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll index 3a961b2dc1719..84f6564ae4a2e 100644 --- a/polly/test/ScopInfo/not-a-reduction.ll +++ b/polly/test/ScopInfo/not-a-reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s ;#define TYPE float ;#define NUM 4 diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll index f4f79525069e5..23b9d3caf741d 100644 --- a/polly/test/ScopInfo/opaque-struct.ll +++ b/polly/test/ScopInfo/opaque-struct.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that we do not crash with unsized (opaque) types. ; diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll index eed27b1c4d9dd..e069ccac55340 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s ; ; Check whether %newval is identified as escaping value, even though it is used ; in a phi that is in the region. Non-affine subregion case. 
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll index 44da399e704d8..27ea11a23a3fe 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] }; diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll index e5dd359158b8b..aaad0dfb2ee60 100644 --- a/polly/test/ScopInfo/parameter-constant-division.ll +++ b/polly/test/ScopInfo/parameter-constant-division.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll index b295f17f628af..444f9a9c24b4e 100644 --- a/polly/test/ScopInfo/parameter_in_dead_statement.ll +++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; Verify we do not create assumptions based on the parameter p_1 which is the ; load %0 and due to error-assumptions not "part of the SCoP". 
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll index 2fe16f9d95f6d..9e6e3d0e1446e 100644 --- a/polly/test/ScopInfo/parameter_product.ll +++ b/polly/test/ScopInfo/parameter_product.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; int n, m; ; void foo(char* __restrict a) diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll index 6544aaec76f74..20986d17b8f0d 100644 --- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll +++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the access function of the store is simple and concise ; diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll index f3923f6127cdd..8d62f156a4394 100644 --- a/polly/test/ScopInfo/partially_invariant_load_1.ll +++ b/polly/test/ScopInfo/partially_invariant_load_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll index d0d74ad99e09b..48580907b2f0b 100644 --- a/polly/test/ScopInfo/partially_invariant_load_2.ll +++ b/polly/test/ScopInfo/partially_invariant_load_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not try to preload *I and assume p != 42. ; diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll index fbbc158b566bb..6d98a6813862e 100644 --- a/polly/test/ScopInfo/phi-in-non-affine-region.ll +++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as ; incoming value for the tmp11 PHI node. 
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll index a1eadff3e9717..251be099c1f49 100644 --- a/polly/test/ScopInfo/phi_after_error_block.ll +++ b/polly/test/ScopInfo/phi_after_error_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s declare void @bar() diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll index a889ec96a4b12..bd5c51e968ff5 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_1.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll index b56b77e1f4534..281b8d33b7756 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_2.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll index 14fdc38201bc8..6d7f0e9484113 100644 --- a/polly/test/ScopInfo/phi_conditional_simple_1.ll +++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll index 76e5507f24b06..2e62dcd5799a3 100644 --- a/polly/test/ScopInfo/phi_loop_carried_float.ll +++ b/polly/test/ScopInfo/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll index c97d9a27b24b7..57d02f24f781b 100644 --- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll +++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @funa() align 2 diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll index ffd1a37f8a79f..600c94e1d9b4c 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_1.ll +++ 
b/polly/test/ScopInfo/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context should be empty since the <nsw> flags on the IV ; increments already guarantee that there is no wrap in the loop trip diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll index 0d6d9029c61c3..d3353ddc5e4e8 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_2.ll +++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll index 9c98ec0c603cf..1b01a98fca06a 100644 --- a/polly/test/ScopInfo/phi_with_invoke_edge.ll +++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @generic_personality_v0(i32, i64, ptr, ptr) diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll index 18ba18c69f1f9..1b983ace1b6a4 100644 --- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll +++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B) { ; while (A != B) { diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll index 846640ac630ff..f80c4978669c4 100644 --- a/polly/test/ScopInfo/pointer-comparison.ll +++ b/polly/test/ScopInfo/pointer-comparison.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: FIXME: Investigate why we need an InvalidContext here.
; diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll index 89dce6536a107..0fdd0bea6f219 100644 --- a/polly/test/ScopInfo/pointer-type-expressions.ll +++ b/polly/test/ScopInfo/pointer-type-expressions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll index 7b6d0d542581b..8ad531d93d290 100644 --- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll +++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; In this test case we pass a pointer %A into a PHI node and also use this ; pointer as base pointer of an array store. As a result, we get both scalar diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll index 13087a517501a..7dfa1ec7905ba 100644 --- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll +++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_bb9 diff --git a/polly/test/ScopInfo/pr38218.ll b/polly/test/ScopInfo/pr38218.ll index 74103f9a2ac38..2c22b1464876d 100644 --- a/polly/test/ScopInfo/pr38218.ll +++ b/polly/test/ScopInfo/pr38218.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s ; ; This code causes the SCoP to be rejected because of an ERRORBLOCK ; assumption and made Polly crash (llvm.org/PR38219). 
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll index 33fa0126aa30e..800b0339a1422 100644 --- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll +++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll index 2d06f4b995976..9cb932eeef18a 100644 --- a/polly/test/ScopInfo/process_added_dimensions.ll +++ b/polly/test/ScopInfo/process_added_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll index 931e08fb8f2fc..62909f8c3e4c5 100644 --- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll +++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops '-pass-remarks-analysis=.*' -disable-output < %s 2>&1 | FileCheck %s ; Make sure we hit the complexity bailout, and don't crash. 
; CHECK: Low complexity assumption: { : false } diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll index 03562b1fd1245..a6e51c7f2048c 100644 --- a/polly/test/ScopInfo/ranged_parameter.ll +++ b/polly/test/ScopInfo/ranged_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll index 18cbbf3b87cd6..554dd6e38cd00 100644 --- a/polly/test/ScopInfo/ranged_parameter_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \ -; RUN: -debug 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true -debug < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll index d236eeeefc11c..7ae15c34c94c6 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; __wrapping__ range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll index fc0a737a5edbe..00c3caa9c50ce 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the context is built fast and does not explode due to us ; combining a large number of non-convex ranges. 
Instead, after a certain diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll index 7e6f2406a0ac8..528dbb102ecb0 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll index 18e6c1fac9e15..6bc1fe71f35f2 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll index f04163e480284..7c78d621930c5 100644 --- a/polly/test/ScopInfo/read-only-scalars.ll +++ b/polly/test/ScopInfo/read-only-scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS ; CHECK-NOT: Memref_scalar diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll index 7bac53a2b6b51..c1cb618a45f64 100644 --- a/polly/test/ScopInfo/read-only-statements.ll +++ b/polly/test/ScopInfo/read-only-statements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check we remove read only statements. 
; diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll index e38ff6046ac01..474c6ac64ffc1 100644 --- a/polly/test/ScopInfo/reduction_alternating_base.ll +++ b/polly/test/ScopInfo/reduction_alternating_base.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; ; void f(int *A) { diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll index 17f9dc57f2823..e91eeaf544a05 100644 --- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll +++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Reduction Type: NONE ; diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll index d2786d5fd6779..5c169f71f4fe8 100644 --- a/polly/test/ScopInfo/reduction_different_index.ll +++ b/polly/test/ScopInfo/reduction_different_index.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that the following case is not detected as a reduction. ; ; void f(int *A,int *sum) { diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll index 710ae3e74f21a..93ab77be84de9 100644 --- a/polly/test/ScopInfo/reduction_different_index1.ll +++ b/polly/test/ScopInfo/reduction_different_index1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that the following case is not detected as a reduction.
 ;
 ; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 61228e075dabe..618e4d3ab3f98 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: +
 ; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
index d126d3d833ee1..a7721d1b42e46 100644
--- a/polly/test/ScopInfo/reduction_double.ll
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Verify if two independent reductions in same loop is detected
 ;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index c66a8be0852fa..86923458ee773 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index c574d315b2fe1..641d2e7337e77 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
index 92a071ea1c372..dd2a76ebbd368 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
index 4f7d3681e0a0b..53a62a3b857e9 100644
--- a/polly/test/ScopInfo/reduction_if.ll
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Verify if reduction spread across multiple blocks in a single scop statement are detected
 ;
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
index 7acac4b150f40..cb54cd9581368 100644
--- a/polly/test/ScopInfo/reduction_indirect_access.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ; CHECK: MemRef_INDICES[i0]
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
index 331953991d86c..5642a8470f124 100644
--- a/polly/test/ScopInfo/reduction_indirect_access_2.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Validate that the accesses to INDICES[i] is not part of a reduction.
 ;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index 9846f1029c087..9e6b3cd431083 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; int f() {
 ; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index 4d70e53304556..7ae7d8ed3ffa2 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *sums) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
index 62ae1fef187b6..6f2f48005bdac 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: MemRef_sum
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
index 7ca46fa9535ac..2fd71c28d5211 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; Sum is added twice in the statement. Hence no reduction.
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
index b77c72a291744..4f049a3505b09 100644
--- a/polly/test/ScopInfo/reduction_multiple_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; Should not be identified as reduction as there are different operations
 ; involved on sum (multiplication followed by addition)
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 800eb2043dc62..0d016674ffc08 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 49ebdcb044988..568513aedfa10 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 77b71f4df301b..0ac50b3b92c47 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: NONE
 ; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index 61aaa051e49d1..f01b641b17f64 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index fb6d236764b74..51685dca8b7da 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index aa4cd00f39f59..67139bba2fded 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Function: f_no_fast_math
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index e385b66f9db21..c17184624c066 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index e22eccbb2831d..7cc50bfe78906 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 8f00954f7efc3..35cb9dfcdb122 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index ad71f1f59e18b..7512da420af0e 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The constraint that r2 has to be bigger than r1 is implicitly contained in
 ; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index d4720511b7aad..38d58c97e1b05 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Statements {
 ; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index 10cc57aa27a14..2d6ace988659d 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
 ; CHECK: remark: test/ScopInfo/remarks.c:9:15: Inbounds assumption: [N, M, Debug] -> { : M <= 100 }
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index abf0b0e23855c..3bb5bfb0765e3 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index 487c585cb9d9c..dd6115c421d0c 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not generate an empty invalid context only because the wrap
 ; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index 702b7dc5e0049..e8df1eccd5945 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,6 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: bb124 => bb176
 ;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 559c38d2682ef..2a8853322f1d5 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; A valid Scop would print the list of it's statements, we check that we do not
 ; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 3cf4c40bdb60f..5e71e7a9d2a46 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,6 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: for => return
 ;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index 51ab81476d542..286f878f935f4 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, float *B, float *C, long N) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index dd809ba156c79..9f4d6f5895aeb 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
 ; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index 812d2fddc3c8e..db8371d96b118 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index 59549f3dbbad5..a09bdaf06844e 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int c, int d) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index 3f61d0d723046..e71c515fa2d35 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; ModuleID = 'scalar_to_array.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index fa0c81fe9a48e..66c50dcbe13f3 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
 
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index 6b9efd4b37c7d..e0956df0b1e84 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Region: %if.then6---%return
 ;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 6e2ed1240b071..4a280cc929e3a 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index d0e8a2accaa2c..777c0088c4ddd 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 9ffc30f7360e9..15dea5a7f4dd8 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index 65f2f99b48c1b..9ac6643564f7b 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index 7c36f8d7f72e8..1657d2f37d8ba 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 ;
 ; This test contains a infinite loop (bb13) and crashed the domain generation
 ; at some point. Just verify it does not anymore.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index c8a234e9cbce7..76bb438d43ff7 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
 target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 93b63df1c5841..135976e7d51c6 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index e736f3382d905..1d9f5c2edebcb 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index ae83dd633b96e..877f860ba5a90 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index c4a96e4381c94..d3834297e2668 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], unsigned N) {
 ; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 37e907dc006f3..1da6053a8316b 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 7f2cf5caa1ce7..0d44bf64ffc18 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index 4df0d343b0fc9..f70b3fa3ea21a 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 @.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index 8968e13192477..3ba2b35e7e503 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
 define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index 0a294f2016eba..aa72db3065259 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -stats -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index a46acb090b7fd..54832607f11d5 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index 3a5ebf0725b10..0a4284bdd34f5 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index 9edd0f0a13e59..ed2180407c68d 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index d645becb19583..f35a07c8d7176 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 1a1ccff4f02d6..41721867f1764 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index 594b36279d6bc..0521525e272b3 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 6c9f1c2cb5fd0..82a85aa5f0099 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 07abe46ac0399..1f21c0ce7225f 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 9a42ae3a37270..580ffab567846 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index ba4801d9a0006..67e8f631312ea 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The statement Stmt_for_if_else_1 should be removed because it has no
 ; sideeffects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0c3610185e6e0..0f9e83210661b 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index f0056da37955d..9defd41f25231 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index a1810bf6ef538..faaa4d0254db9 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index 00665fd75cbcd..c82e703a82965 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 2de3695649404..5a49be8d80975 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index b859840ee111f..379981b167039 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index f73d97f70b28d..0c8efc590b9c9 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int c, int N) {
 ; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 4f02176569b73..09cc95e42a584 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], int N, int *init_ptr) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index 746350422d6b9..d64bdf985c1d2 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invalid Context:
 ; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index 44222c88dfa77..d531dd8e5ab08 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index c78a5337fdeba..3f5d1faf4c377 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index 5a80a873cd476..d20f375b9a2bd 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
 
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index e2723a8a9a2e9..aa2be1003adcc 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not create a SCoP in the presence of infinite loops.
 ;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index 51f3c2d6eb875..163642d9072e2 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index ef117612f6cb3..5fb08f82b3267 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index 5b9f980591777..11418499702df 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
 ;
 ; void f(int *A, int b) {
 ; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index 163170ce74895..e23b3ae5404b5 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; This test case verifies that we generate numbered statement names in case
 ; no LLVM-IR names are used in the test case. We also verify, that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index daa1f8c783870..5bc136658ccab 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
 ;
 ; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index ca8daa4de01a6..3f6bb937ded1a 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
 
 ; Check the effect of -polly-unprofitable-scalar-accs
 
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 0529ded1f6cfb..608b6d6e50a36 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N, unsigned P) {
 ; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 1c06b55300b67..58d39dc239ac9 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index 153639c42b384..cda666d6f5ebf 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 34561fc4645cc..50de3c59892e7 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index be539b47123bc..4dd75e526407d 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index 61716ecec0d90..fff131292271a 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index d25a9576e863a..f54b9bec6e7df 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 0310fdde6d26e..20afd17f86793 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 47bfc6065b1a8..6fa6cc12990a3 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll
index 1b73c0d6dd7ee..4a3b604d81f0f 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ult.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll
index 74088120e4015..ce8dd921cec16 100644
--- a/polly/test/ScopInfo/user_context.ll
+++ b/polly/test/ScopInfo/user_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
-; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-polly-context=[N] -> {: N = 1024}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
+; RUN: opt %loadNPMPolly '-polly-context=[N,M] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-polly-context=[] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
index bd13ba8bb6961..c35ed9060e504 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) }
 ;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
index 45f59170942ed..2afe99fd2c53b 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: [n] -> { : -9223372036854775808 <= n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
index fb71c75aa75e4..3479558062671 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; REMARK: remark: <unknown>:0:0: SCoP begins here.
 ; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll
index 49b23b1e784dc..0bd99ea3fcb35 100644
--- a/polly/test/ScopInfo/user_provided_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
 ;
 ; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M } diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll index f8643b68cc63f..1499ab98f7369 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_2.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : } diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll index 70f8f359e16cd..aa1f72dddde9d 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_3.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 } diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll index 3e7883db48fcb..a6eed5df2063e 100644 --- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. 
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [i, N, M] -> { : N <= i or (N > i and N >= 0) } @@ -18,8 +17,7 @@ ; -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output -pass-remarks-output=%t.yaml < %s 2>&1 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; YAML: --- !Analysis ; YAML: Pass: polly-scops diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll index 32cb114fab05a..36beaf5f0f016 100644 --- a/polly/test/ScopInfo/variant_base_pointer.ll +++ b/polly/test/ScopInfo/variant_base_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -disable-output < %s ; ; %tmp is added to the list of required hoists by -polly-scops and just ; assumed to be hoisted. Only -polly-scops recognizes it to be unhoistable diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll index 6a28bd0405fdd..5602c443b25d3 100644 --- a/polly/test/ScopInfo/variant_load_empty_domain.ll +++ b/polly/test/ScopInfo/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: } diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll index f5f06bfd7d336..3a663f57c2774 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_0.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, char N, char p) { ; for (char i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll index e04257acc2010..8963e86bc6157 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(long *A, long N, long p) { ; for (long i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll index 2511c0d646086..97cb2c05b16a0 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_2.ll +++ 
b/polly/test/ScopInfo/wraping_signed_expr_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll index 2106bdf4c0686..50e2eda2ce574 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_3.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll index 3ea17f6e266bf..4ddb43a01bf24 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_4.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(char *A, char N, char p) { ; for (char i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll index 90706a3d3bc46..440d32bab72a5 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_5.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; We should not generate runtime check for ((int)r1 + (int)r2) as it is known not ; to overflow. However (p + q) can, thus checks are needed. 
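A note on the no-wrap reasoning quoted above (illustrative IR, not taken from the test): operands sign-extended from a narrower type are bounded, so their sum cannot wrap the wider type, while a sum of two arbitrary i32 parameters has no such bound.

  %r1.ext = sext i8 %r1 to i32          ; value in [-128, 127]
  %r2.ext = sext i8 %r2 to i32          ; value in [-128, 127]
  %safe = add i32 %r1.ext, %r2.ext      ; sum in [-256, 254], cannot wrap i32
  %unsafe = add i32 %p, %q              ; unbounded operands, needs a runtime check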
diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll index 9cf67fc101805..7bec9533440fb 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_6.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll index d18d2b2df3e12..2d836e191f858 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_7.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll index 84626861bd39b..4964a123d0be1 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll index b4dd567bafa6b..a6db7c06d072c 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index cbe4af05169f8..b509951bbf0d5 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll index b306045276765..ea3356e01cc9f 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
-polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll index 3c02ae295b5ba..9fd1afae4b889 100644 --- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll +++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [dim] -> { : dim > 0 } diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll index 5c0cfa103f0bf..85198b728a9bb 100644 --- a/polly/test/ScopInliner/ignore-declares.ll +++ b/polly/test/ScopInliner/ignore-declares.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s ; Check that we do not crash if there are declares. We should skip function ; declarations and not try to query for domtree. diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll index 58c556a455fb9..6046fc0f38650 100644 --- a/polly/test/ScopInliner/invariant-load-func.ll +++ b/polly/test/ScopInliner/invariant-load-func.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that we inline a function that requires invariant load hoisting ; correctly. diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll index f12798a3d831a..77a5ddda93adc 100644 --- a/polly/test/ScopInliner/simple-inline-loop.ll +++ b/polly/test/ScopInliner/simple-inline-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s | FileCheck %s ; Check that we get the 2 nested loops by inlining `to_be_inlined` into ; `inline_site`. 
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll index 4112787e51bfa..5411b6e430c66 100644 --- a/polly/test/Simplify/coalesce_3partials.ll +++ b/polly/test/Simplify/coalesce_3partials.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine 3 partial accesses into one. ; diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll index b140f287e27f7..888daeff39d8d 100644 --- a/polly/test/Simplify/coalesce_disjointelements.ll +++ b/polly/test/Simplify/coalesce_disjointelements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine four partial stores into two. ; The stores write to the same array, but never the same element. diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll index ee716fc12f095..f492222461b34 100644 --- a/polly/test/Simplify/coalesce_overlapping.ll +++ b/polly/test/Simplify/coalesce_overlapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with overlapping domains) into one. ; diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll index aea691f43e934..4df91d43fc46d 100644 --- a/polly/test/Simplify/coalesce_partial.ll +++ b/polly/test/Simplify/coalesce_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with disjoint domains) into one. 
; diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll index 66f94795ea6e4..399c02381c890 100644 --- a/polly/test/Simplify/dead_access_load.ll +++ b/polly/test/Simplify/dead_access_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead load-instruction ; (an load whose result is not used anywhere) diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll index fb40e4cc45b35..9344a284b311a 100644 --- a/polly/test/Simplify/dead_access_phi.ll +++ b/polly/test/Simplify/dead_access_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead PHI write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll index a8ff7f28542b7..6db242c97dac0 100644 --- a/polly/test/Simplify/dead_access_value.ll +++ b/polly/test/Simplify/dead_access_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead value write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll index 81e55e1c7bb30..785b5ba154187 100644 --- a/polly/test/Simplify/dead_instruction.ll +++ b/polly/test/Simplify/dead_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead instruction ; (an instruction whose result is not used anywhere) diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll index 9b06cec965a9d..917ae7f7d2c94 100644 --- a/polly/test/Simplify/emptyaccessdomain.ll +++ b/polly/test/Simplify/emptyaccessdomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; for (int j = 0; j < n; j += 1) { ; A[0] = 42.0; diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll index 379c7e0ace0a3..d56fed4848ff3 100644 --- 
a/polly/test/Simplify/exit_phi_accesses-2.ll +++ b/polly/test/Simplify/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s ; ; The use of %sum.next by %phi counts as an escaping use. ; Don't remove the scalar write of %sum.next. diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll index 5aa2caba95cfc..65aa9cd28314e 100644 --- a/polly/test/Simplify/func-b320a7.ll +++ b/polly/test/Simplify/func-b320a7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; llvm.org/PR47098 ; Use-after-free by reference to Stmt remaining in InstStmtMap after removing it has been removed by Scop::simplifyScop. diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll index 5120de2db7677..6e3a43e0ebbad 100644 --- a/polly/test/Simplify/gemm.ll +++ b/polly/test/Simplify/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; void gemm(float A[][1024], float B[][1024], float C[][1024]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll index 33d04b2f96de8..cba62549227ae 100644 --- a/polly/test/Simplify/nocoalesce_differentvalues.ll +++ b/polly/test/Simplify/nocoalesce_differentvalues.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that write different values. ; diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll index 608b055e691df..b589d13779e52 100644 --- a/polly/test/Simplify/nocoalesce_elementmismatch.ll +++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that do not write to different elements in the ; same instance. 
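The next test adds one more hazard: a read between the two stores. A minimal fragment (hand-written, not from the test) shows why merging is refused there: the intervening load observes the first store, and a combined store would change what it reads.

  store double 1.0, ptr %A
  %v = load double, ptr %A    ; must observe the 1.0 written above
  store double 2.0, ptr %A    ; coalescing this into the first store
                              ; would make %v read 2.0 instead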
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll index e112b036cd778..b61ad9d8031e0 100644 --- a/polly/test/Simplify/nocoalesce_readbetween.ll +++ b/polly/test/Simplify/nocoalesce_readbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a read between them. ; Note: The read between is unused, so will be removed by markAndSweep. diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll index fd5eee52eaf5c..be7d159554034 100644 --- a/polly/test/Simplify/nocoalesce_writebetween.ll +++ b/polly/test/Simplify/nocoalesce_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a write between them. ; diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll index 42fafb446cea3..1bd9bfe10a99d 100644 --- a/polly/test/Simplify/notdead_region_exitphi.ll +++ b/polly/test/Simplify/notdead_region_exitphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node in a region's exit block. ; diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll index 966448c9884b2..b59d6dc60b089 100644 --- a/polly/test/Simplify/notdead_region_innerphi.ll +++ b/polly/test/Simplify/notdead_region_innerphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node within a region statement (%phi). 
; diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll index 88f6c41521739..859bd459f72d6 100644 --- a/polly/test/Simplify/notredundant_region_loop.ll +++ b/polly/test/Simplify/notredundant_region_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not remove the store in region_entry. It can be executed multiple times ; due to being part of a non-affine loop. diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll index 43c05436809ba..a742ea889fb1f 100644 --- a/polly/test/Simplify/notredundant_region_middle.ll +++ b/polly/test/Simplify/notredundant_region_middle.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove redundant stores in the middle of region statements. ; The store in region_true could be removed, but in practice we do try to diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll index 8a9aec8be9e05..8542b7927f860 100644 --- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll +++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove the scalar value write of %i.trunc in inner.for. ; It is used by body. diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll index 7218f328f9ca3..06b082c3f81fa 100644 --- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s 2>&1 | FileCheck %s ; ; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of ; %tmp4, it is an "external use". 
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll index eccdd8044d073..bc5b2dffd443d 100644 --- a/polly/test/Simplify/overwritten.ll +++ b/polly/test/Simplify/overwritten.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll index 4cee4f13d26d0..861c9acda3e9c 100644 --- a/polly/test/Simplify/overwritten_3phi.ll +++ b/polly/test/Simplify/overwritten_3phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll index c9f06c85dba53..cfd5a08143d60 100644 --- a/polly/test/Simplify/overwritten_3store.ll +++ b/polly/test/Simplify/overwritten_3store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that even multiple stores are removed. diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll index b1b7635e26263..306e726e7808a 100644 --- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll +++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that this works even if one of the writes is a scalar MemoryKind. diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll index cdca2f11531e7..170838ddb8a1a 100644 --- a/polly/test/Simplify/overwritten_loadbetween.ll +++ b/polly/test/Simplify/overwritten_loadbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Do not remove overwrites when the value is read before. 
; diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll index 700adb6aed2ec..a1e7da40554d5 100644 --- a/polly/test/Simplify/overwritten_scalar.ll +++ b/polly/test/Simplify/overwritten_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll index 4d1d800b2a80b..6d9c99f9dc270 100644 --- a/polly/test/Simplify/pass_existence.ll +++ b/polly/test/Simplify/pass_existence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<simplify>' -polly-print-simplify -aa-pipeline=basic-aa < %s < %s | FileCheck %s ; ; Simple test for the existence of the Simplify pass. ; diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll index 2bb05738955a3..ba1cffee1a0df 100644 --- a/polly/test/Simplify/phi_in_regionstmt.ll +++ b/polly/test/Simplify/phi_in_regionstmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region ; statement. 
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll index 22921d5fba509..5130eb8488ca2 100644 --- a/polly/test/Simplify/pr33323.ll +++ b/polly/test/Simplify/pr33323.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; llvm.org/PR33323 ; diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll index 540e537460e54..f2489a74eb899 100644 --- a/polly/test/Simplify/redundant.ll +++ b/polly/test/Simplify/redundant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll index 5ce25836dedbd..efd20e90ae748 100644 --- a/polly/test/Simplify/redundant_differentindex.ll +++ b/polly/test/Simplify/redundant_differentindex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; A store that has a different index than the load it is storing is ; not redundant. diff --git a/polly/test/Simplify/redundant_partialwrite.ll b/polly/test/Simplify/redundant_partialwrite.ll index ac5ca907fff6f..357b63206b0f5 100644 --- a/polly/test/Simplify/redundant_partialwrite.ll +++ b/polly/test/Simplify/redundant_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop-postfix=transformed -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;simplify>' -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove a redundant store, if its partial domain is a subset of the ; read's domain. diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll index 927aac6c4af05..c60d28b7039dd 100644 --- a/polly/test/Simplify/redundant_region.ll +++ b/polly/test/Simplify/redundant_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. 
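For orientation, the core pattern behind these redundant-store tests, as a minimal hand-written fragment (hypothetical, not one of the tests above): a store that writes back the value just loaded from the same location changes nothing, so simplify can drop it.

  %val = load double, ptr %A
  store double %val, ptr %A   ; stores what is already there: redundant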
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll index 72d570d46bdce..3de50c04b614f 100644 --- a/polly/test/Simplify/redundant_region_scalar.ll +++ b/polly/test/Simplify/redundant_region_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll index 84cb971be11fd..13ca40f8e1b87 100644 --- a/polly/test/Simplify/redundant_scalarwrite.ll +++ b/polly/test/Simplify/redundant_scalarwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant scalar stores. ; diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll index 6540d7751e469..47d9cfde2d3ce 100644 --- a/polly/test/Simplify/redundant_storebetween.ll +++ b/polly/test/Simplify/redundant_storebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Don't remove store where there is another store to the same target ; in-between them. diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll index c6e36f9dcdefb..969aade275af2 100644 --- a/polly/test/Simplify/scalability1.ll +++ b/polly/test/Simplify/scalability1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. ; diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll index adcf9eef348a9..7951094867f2f 100644 --- a/polly/test/Simplify/scalability2.ll +++ b/polly/test/Simplify/scalability2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. 
; diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll index 495d77a22f618..ad41f2566e2b5 100644 --- a/polly/test/Simplify/sweep_mapped_phi.ll +++ b/polly/test/Simplify/sweep_mapped_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed. ; diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll index c83941a8f0ba5..a50c013ac7917 100644 --- a/polly/test/Simplify/sweep_mapped_value.ll +++ b/polly/test/Simplify/sweep_mapped_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed. ; diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll index f2436c263a96a..4c05de975fdf8 100644 --- a/polly/test/Simplify/ununsed_read_in_region_entry.ll +++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;simplify>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; for (int i = 0; i < n; i+=1) { ; (void)A[0]; diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll index 872a32fad4fed..b75dd872ad404 100644 --- a/polly/test/Support/Plugins.ll +++ b/polly/test/Support/Plugins.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -S < %s | FileCheck %s ; This testcase tests plugin registration. Check-lines below serve to verify ; that the passes actually ran. 
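Stepping back from the individual hunks, the pattern applied throughout these test updates (inferred from the diffs themselves, not from separate documentation of the new syntax) is: printing-pass instantiations such as '-passes=print<polly-function-scops>' or '-passes=scop(print<polly-simplify>)' become a single '-passes=polly-custom<...>' pipeline that names the phases to run, semicolon-separated where there are several (e.g. 'import-jscop;simplify-0'), while the textual dump is requested separately via a '-polly-print-*' flag. Schematically:

  ; old: a dedicated printing pass produces the output
  ; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
  ; new: run the phase, ask for the dump with a flag
  ; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s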
diff --git a/polly/test/Support/exportjson.ll b/polly/test/Support/exportjson.ll index 22cfea23534cb..6bdf5a4c33cf3 100644 --- a/polly/test/Support/exportjson.ll +++ b/polly/test/Support/exportjson.ll @@ -1,6 +1,6 @@ ; RUN: rm -rf %t ; RUN: mkdir -p %t -; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t -polly -O2 -polly-export -S < %s +; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t '-passes=polly-custom<export-jscop>' -disable-output < %s ; RUN: FileCheck %s -input-file %t/exportjson___%entry.split---%return.jscop ; ; for (int j = 0; j < n; j += 1) { @@ -9,28 +9,22 @@ ; define void @exportjson(i32 %n, ptr noalias nonnull %A) { entry: - br label %for + br label %entry.split -for: - %j = phi i32 [0, %entry], [%j.inc, %inc] - %j.cmp = icmp slt i32 %j, %n - br i1 %j.cmp, label %body, label %exit +entry.split: + %j.cmp1 = icmp sgt i32 %n, 0 + br i1 %j.cmp1, label %body.lr.ph, label %return - body: - store double 42.0, ptr %A - br label %inc - -inc: - %j.inc = add nuw nsw i32 %j, 1 - br label %for - -exit: +body.lr.ph: + store double 4.200000e+01, ptr %A, align 8 br label %return return: ret void } +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } + ; CHECK: { ; CHECK-NEXT: "arrays": [ diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll index 206cb73bfc5ab..6c8b2e97682e8 100644 --- a/polly/test/Support/isl-args.ll +++ b/polly/test/Support/isl-args.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP -; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP +; RUN: not opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1 | FileCheck %s -match-full-lines --check-prefix=UNKNOWN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s ; VERSION: isl-{{.*}}-IMath-32 ; HELP: Usage: -polly-isl-arg [OPTION...] 
diff --git a/polly/test/Support/pipelineposition.ll b/polly/test/Support/pipelineposition.ll
index a4506ba1d64ed..1ddfb5879ce16 100644
--- a/polly/test/Support/pipelineposition.ll
+++ b/polly/test/Support/pipelineposition.ll
@@ -1,8 +1,6 @@
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3
-;
-; REQUIRES: asserts
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3
 ;
 ; void callee(int n, double A[], int i) {
 ;   for (int j = 0; j < n; j += 1)
diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in
index f22063e796def..ca901b8825ced 100644
--- a/polly/test/lit.site.cfg.in
+++ b/polly/test/lit.site.cfg.in
@@ -38,14 +38,10 @@ if config.llvm_polly_link_into_tools == '' or \
    config.llvm_polly_link_into_tools.lower() == 'false' or \
    config.llvm_polly_link_into_tools.lower() == 'notfound' or \
    config.llvm_polly_link_into_tools.lower() == 'llvm_polly_link_into_tools-notfound':
-    config.substitutions.append(('%loadPolly', '-load '
-                                + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@'
-                                + commonOpts ))
     config.substitutions.append(('%loadNPMPolly', '-load-pass-plugin '
                                 + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@'
                                 + commonOpts ))
 else:
-    config.substitutions.append(('%loadPolly', commonOpts ))
     config.substitutions.append(('%loadNPMPolly', commonOpts ))
 
 import lit.llvm
diff --git a/polly/test/polly.ll b/polly/test/polly.ll
index 2e455b39a9cd4..0f5467b0e654d 100644
--- a/polly/test/polly.ll
+++ b/polly/test/polly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @foo() nounwind {
 start:

From cf11ff9084ca703a2e32e396696007cd0799086f Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 3 Nov 2025 14:35:01 -0800
Subject: [PATCH 094/313] Exclude another StructurizeCFG test from profcheck
 (#166248)

Haven't yet addressed this pass
---
 llvm/utils/profcheck-xfail.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 61bc936cd151a..d7af3a7ecbdee 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1297,6 +1297,7 @@ Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
 Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
+Transforms/StructurizeCFG/callbr.ll
 Transforms/StructurizeCFG/hoist-zerocost.ll
 Transforms/StructurizeCFG/loop-break-phi.ll
 Transforms/StructurizeCFG/nested-loop-order.ll

From 01221874e41e0cba9161fda3601d10aa36537512 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 3 Nov 2025 14:37:41 -0800
Subject: [PATCH 095/313] [SLU][profcheck] Use the original branch weights in
 `buildPartialInvariantUnswitchConditionalBranch` (#164270)

A new branch is created on the same condition as a branch for which we
have a profile. We can reuse that profile in this case.

Issue #147390
---
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  | 19 ++++-
 .../SimpleLoopUnswitch/partial-unswitch.ll    | 75 +++++++++++--------
 2 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index bb6c879f4d47e..239526e85e1fd 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch(
 static void buildPartialInvariantUnswitchConditionalBranch(
     BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L,
-    MemorySSAUpdater *MSSAU) {
+    MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) {
   ValueToValueMapTy VMap;
   for (auto *Val : reverse(ToDuplicate)) {
     Instruction *Inst = cast<Instruction>(Val);
@@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch(
   IRBuilder<> IRB(&BB);
   IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
   Value *Cond = VMap[ToDuplicate[0]];
-  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
-                   Direction ? &NormalSucc : &UnswitchedSucc);
+  // The expectation is that ToDuplicate[0] is the condition used by the
+  // OriginalBranch, in which case we can clone the profile metadata from there.
+  auto *ProfData =
+      !ProfcheckDisableMetadataFixes &&
+              ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition())
+          ? OriginalBranch.getMetadata(LLVMContext::MD_prof)
+          : nullptr;
+  auto *BR =
+      IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+                       Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
+  if (!ProfData)
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
+                                                DEBUG_TYPE);
 }
 
 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants(
   // the branch in the split block.
if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 
exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], 
!llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], 
[[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. From 148a42bdd2f252b4366b79fc518356bb06cacac3 Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt <ndrewh@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:40:01 -0800 Subject: [PATCH 096/313] [sanitizer-common] [Darwin] Provide warnings for common sandbox issues (#165907) We currently do not handle errors in task_set_exc_guard_behavior. If this fails, mmap can unexpectedly crash. We also do not currently provide a clear warning if no external symbolizers are found. rdar://163798535 --- compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp | 12 +++++++++++- .../sanitizer_symbolizer_posix_libcdep.cpp | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index b0a29db908639..90c0b66f81b5b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() { RTLD_DEFAULT, "task_set_exc_guard_behavior"); if (set_behavior == nullptr) return; const task_exc_guard_behavior_t task_exc_guard_none = 0; - set_behavior(mach_task_self(), task_exc_guard_none); + kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none); + if (res != KERN_SUCCESS) { + Report( + "WARN: task_set_exc_guard_behavior returned %d (%s), " + "mmap may fail unexpectedly.\n", + res, mach_error_string(res)); + if (res == KERN_DENIED) + Report( + "HINT: Check that task_set_exc_guard_behavior is allowed by " + "sandbox.\n"); + } } static void VerifyInterceptorsWorking(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index f8d821e125b7a..7eb0c9756d64a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list, } # if SANITIZER_APPLE + if (list->empty()) { + Report( + "WARN: No external symbolizers found. Symbols may be missing or " + "unreliable.\n"); + Report( + "HINT: Is PATH set? 
Does sandbox allow file-read of /usr/bin/atos?\n");
+  }
   VReport(2, "Using dladdr symbolizer.\n");
   list->push_back(new (*allocator) DlAddrSymbolizer());
 # endif // SANITIZER_APPLE

From c0cb5133489fa746ae5de43a17991778281fe4c9 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Mon, 3 Nov 2025 14:42:05 -0800
Subject: [PATCH 097/313] [lld][WebAssembly] Use writePtrConst helper function
 (#166228)

This is especially important for writing i32 values larger than 2gb,
which need to be encoded as negative SLEB values in the binary. Without
this change, offsets over 2gb are wrongly encoded and cause validation
errors.

Fixes: https://github.com/emscripten-core/emscripten/issues/25706
---
 lld/test/wasm/runtime-relocations-himem.s | 60 +++++++++++++++++++++++
 lld/wasm/InputChunks.cpp                  | 14 ++----
 lld/wasm/SyntheticSections.cpp            |  8 +--
 3 files changed, 66 insertions(+), 16 deletions(-)
 create mode 100644 lld/test/wasm/runtime-relocations-himem.s

diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s
new file mode 100644
index 0000000000000..a12a93a6cb933
--- /dev/null
+++ b/lld/test/wasm/runtime-relocations-himem.s
@@ -0,0 +1,60 @@
+## Verifies runtime relocation code for addresses over 2gb works correctly.
+## We have had issues with LEB encoding of addresses over 2gb in i32.const
+## instruction leading to invalid binaries.
+
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o
+# XUN: obj2yaml %t.wasm | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --
+
+.globl tls_sym
+.globl data_sym
+.globl _start
+.globaltype __tls_base, i32
+
+_start:
+  .functype _start () -> ()
+  global.get __tls_base
+  i32.const tls_sym@TLSREL
+  i32.add
+  drop
+  i32.const data_sym
+  drop
+  end_function
+
+.section tls_sec,"T",@
+.p2align 2
+tls_sym:
+  .int32 0
+  .int32 extern_sym
+  .size tls_sym, 8
+
+.section data_sec,"",@
+.p2align 2
+data_sym:
+  .int32 0
+  .int32 extern_sym
+  .size data_sym, 8
+
+.section .custom_section.target_features,"",@
+  .int8 2
+  .int8 43
+  .int8 7
+  .ascii "atomics"
+  .int8 43
+  .int8 11
+  .ascii "bulk-memory"
+
+# CHECK: <__wasm_apply_data_relocs>:
+# CHECK-EMPTY:
+# CHECK-NEXT: i32.const -2147483636
+# CHECK-NEXT: global.get 0
+# CHECK-NEXT: i32.store 0
+# CHECK-NEXT: end
+
+# CHECK: <__wasm_apply_tls_relocs>:
+# CHECK-EMPTY:
+# CHECK-NEXT: i32.const -2147483644
+# CHECK-NEXT: global.get 0
+# CHECK-NEXT: i32.store 0
+# CHECK-NEXT: end
diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp
index 44927e7a432bc..14e02e6009318 100644
--- a/lld/wasm/InputChunks.cpp
+++ b/lld/wasm/InputChunks.cpp
@@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
   bool is64 = ctx.arg.is64.value_or(false);
   bool generated = false;
-  unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
-                                   : WASM_OPCODE_I32_CONST;
   unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD
                                 : WASM_OPCODE_I32_ADD;
@@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
            << " output offset=" << offset << "\n");
 
     // Calculate the address at which to apply the relocation
-    writeU8(os, opcode_ptr_const, "CONST");
-    writeSleb128(os, offset, "offset");
+    writePtrConst(os, offset, is64, "offset");
 
     // In PIC mode we need to add the __memory_base
     if (ctx.isPic) {
@@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
 
     // Now figure out what we want to store at this location
     bool is64 = relocIs64(rel.Type);
-    unsigned opcode_reloc_const =
-        is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST;
     unsigned opcode_reloc_add =
         is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD;
     unsigned opcode_reloc_store =
@@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
       writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET");
       writeUleb128(os, sym->getGOTIndex(), "global index");
       if (rel.Addend) {
-        writeU8(os, opcode_reloc_const, "CONST");
-        writeSleb128(os, rel.Addend, "addend");
+        writePtrConst(os, rel.Addend, is64, "addend");
         writeU8(os, opcode_reloc_add, "ADD");
       }
     } else {
@@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
         baseSymbol = ctx.sym.tlsBase;
       writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET");
       writeUleb128(os, baseSymbol->getGlobalIndex(), "base");
-      writeU8(os, opcode_reloc_const, "CONST");
-      writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset");
+      writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64,
+                    "offset");
       writeU8(os, opcode_reloc_add, "ADD");
     }
 
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index e1192706ea913..399a5084e6595 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) {
 void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
   assert(!ctx.arg.extendedConst);
   bool is64 = ctx.arg.is64.value_or(false);
-  unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
-                                   : WASM_OPCODE_I32_CONST;
   unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD
                                  : WASM_OPCODE_I32_ADD;
 
@@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
       writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base");
 
       // Add the virtual address of the data symbol
-      writeU8(os, opcode_ptr_const, "CONST");
-      writeSleb128(os, d->getVA(), "offset");
+      writePtrConst(os, d->getVA(), is64, "offset");
     } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) {
       if (f->isStub)
         continue;
@@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
       writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base");
 
       // Add the table index to __table_base
-      writeU8(os, opcode_ptr_const, "CONST");
-      writeSleb128(os, f->getTableIndex(), "offset");
+      writePtrConst(os, f->getTableIndex(), is64, "offset");
     } else {
       assert(isa<UndefinedData>(sym) || isa<SharedData>(sym));
       continue;

From 562e3bfcd45cddc1da133780b401564471c8c66d Mon Sep 17 00:00:00 2001
From: YongKang Zhu <yongzhu@fb.com>
Date: Mon, 3 Nov 2025 14:44:05 -0800
Subject: [PATCH 098/313] [BOLT] Add an option for constant island cloning
 (#165778)

Avoiding constant island cloning helps to reduce app size, especially
for BOLT optimizations where cloning would happen when a function is
split into multiple fragments.
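
For illustration, a size-sensitive build could then opt out of cloning
roughly like this (a sketch: the binary and profile names are
placeholders; only the -clone-constant-island flag added below comes
from this patch):

  llvm-bolt app -o app.bolt -data=perf.fdata -clone-constant-island=false
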
Add an option to disable the cloning; a follow-up (#165787) will
introduce a new pass to handle the "reference too far" errors that may
result from disabling constant island cloning.
---
 bolt/lib/Core/BinaryContext.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index c7cd034a30410..7af32c8c56635 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -78,6 +78,11 @@ cl::opt<std::string> CompDirOverride(
              "to *.dwo files."),
     cl::Hidden, cl::init(""), cl::cat(BoltCategory));
 
+static cl::opt<bool> CloneConstantIsland("clone-constant-island",
+                                         cl::desc("clone constant islands"),
+                                         cl::Hidden, cl::init(true),
+                                         cl::ZeroOrMore, cl::cat(BoltCategory));
+
 static cl::opt<bool>
     FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false),
                          cl::desc("treat invalid code padding as error"),
@@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   // of dynamic relocs, as we currently do not support cloning them.
   // Notice: we might fail to link because of this, if the original constant
   // island we are referring would be emitted too far away.
-  if (IslandIter->second->hasDynamicRelocationAtIsland()) {
+  if (IslandIter->second->hasDynamicRelocationAtIsland() ||
+      !opts::CloneConstantIsland) {
     MCSymbol *IslandSym =
         IslandIter->second->getOrCreateIslandAccess(Address);
     if (IslandSym)
@@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   } else if (MCSymbol *IslandSym =
                 IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) {
+    LLVM_DEBUG(
+        dbgs() << "BOLT-DEBUG: clone constant island at address 0x"
+               << Twine::utohexstr(IslandIter->first) << " with size of 0x"
+               << Twine::utohexstr(
+                      IslandIter->second->estimateConstantIslandSize())
+               << " bytes, referenced by " << BF << "\n");
     BF.createIslandDependency(IslandSym, IslandIter->second);
     return std::make_pair(IslandSym, 0);
   }

From 475c632b17a9f9f9ae9428c7621687e255710b7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Mon, 3 Nov 2025 12:47:50 -1000
Subject: [PATCH 099/313] [flang][cuda] Use local scope to avoid duplicate
 definition (#166249)

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 23 +++++++++++--------
 flang/test/Lower/CUDA/cuda-device-proc.cuf    |  4 ++--
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6be4d9ce0a46c..2db0606d2bc9e 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3433,13 +3433,15 @@ IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
   builder.setInsertionPointToStart(afterBlock);
   auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
   auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
-  mlir::Value ret =
-      mlir::NVVM::InlinePtxOp::create(
-          builder, loc, {resultType}, {barrier, args[1], ns}, {},
-          ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
-          "selp.b32 %0, 1, 0, p;",
-          {})
-          .getResult(0);
+  mlir::Value ret = mlir::NVVM::InlinePtxOp::create(
+                        builder, loc, {resultType}, {barrier, args[1], ns}, {},
+                        "{\n"
+                        "  .reg .pred p;\n"
+                        "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
+                        "  selp.b32 %0, 1, 0, p;\n"
"}", + {}) + .getResult(0); mlir::scf::YieldOp::create(builder, loc, ret); builder.setInsertionPointAfter(whileOp); return whileOp.getResult(0); @@ -3454,8 +3456,11 @@ IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); return mlir::NVVM::InlinePtxOp::create( builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", {}) .getResult(0); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 674548b7489e8..ed015df263070 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -519,7 +519,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -530,7 +530,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 From 2a42a85f5bf8202f942a7d638c1abde6dbba0db6 Mon Sep 17 00:00:00 2001 From: Stefan Mada <smada@nvidia.com> Date: Mon, 3 Nov 2025 15:02:59 -0800 Subject: [PATCH 100/313] [MLIR][NVVM] Add support for Convert Ops with rs-rounding mode (#165736) Added NVVM dialect operations for stochastic rounding (.rs) conversions from F32 to various packed floating-point formats. These operations map to existing PTX instructions and LLVM intrinsics. Supported conversions: - F32x2 to F16x2/BF16x2 (with optional relu and satfinite modifiers) - F32x4 to packed F8 formats (E4M3, E5M2) - F32x4 to packed F6 formats (E2M3, E3M2) - F32x4 to packed F4 format (E2M1) All operations support stochastic rounding with randomness provided via an rbits parameter, and optional relu and saturation modifiers. 
--- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 93 ++++++++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 132 +++++++++++++ .../invalid-convert-stochastic-rounding.mlir | 90 +++++++++ .../nvvm/convert_stochastic_rounding.mlir | 182 ++++++++++++++++++ 4 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ba5e48e4ec9ba..46fdf5441bc13 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1589,10 +1589,11 @@ def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; +def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, - FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA]> { + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -1906,6 +1907,96 @@ def NVVM_ConvertF6x2ToF16x2Op : def NVVM_ConvertF4x2ToF16x2Op : NVVM_ConvertToFP16x2Op_Base<"F4", I8, "F16">; +//===----------------------------------------------------------------------===// +// NVVM Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +// Base class for conversions from F32x2 to FPx2 formats +// (F16x2, BF16x2) +// TODO: In separate PR, add .rn and .rz rounding variants for this conversion +// as currently only support .rs rounding mode +class NVVM_ConvertF32x2ToFPx2OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins F32:$src_hi, F32:$src_lo, I32:$rbits, + DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::RS">:$rnd, + DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat, + DefaultValuedAttr<BoolAttr, "false">:$relu)> { + let summary = "Convert two F32 values to packed " # dstFormat # " with stochastic rounding (.rs)"; + let description = [{ + Converts two F32 values to packed }] # dstFormat # [{ format using stochastic + rounding (.rs) mode with randomness provided by the `rbits` parameter. The + `relu` attribute clamps negative results to 0. The `sat` attribute determines + saturation behavior. The `src_hi` and `src_lo` parameters correspond to operands + `a` and `b` in the PTX ISA, respectively. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src_hi `,` $src_lo `,` $rbits attr-dict `:` type($dst)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src_hi, $src_lo, $rbits}); + }]; + } + +// F32x2 -> F16x2 with stochastic rounding +def NVVM_ConvertF32x2ToF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"f16x2", "convert.f32x2.to.f16x2", VectorOfLengthAndType<[2], [F16]>>; + +// F32x2 -> BF16x2 with stochastic rounding +def NVVM_ConvertF32x2ToBF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"bf16x2", "convert.f32x2.to.bf16x2", VectorOfLengthAndType<[2], [BF16]>>; + +// Base class for stochastic rounding conversions from F32x4 to FPx4 formats +// (E4M3x4, E5M2x4, E2M3x4, E3M2x4, E2M1x4) +// These operations always use RS (stochastic rounding) mode with SATFINITE saturation. +class NVVM_ConvertF32x4ToFPx4OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins VectorOfLengthAndType<[4], [F32]>:$src, I32:$rbits, + DefaultValuedAttr<BoolAttr, "false">:$relu, + TypeAttr:$dstTy)> { + let summary = "Convert vector<4xf32> to packed " # dstFormat # " with stochastic rounding (.rs) and satfinite"; + let description = [{ + Converts a vector<4xf32> to packed }] # dstFormat # [{ format using + stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is + provided by the `rbits` parameter. The `dstTy` attribute specifies the + target floating-point format. The `relu` attribute clamps negative results to 0. + + Note: These operations always use RS rounding mode and SATFINITE saturation mode. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src `,` $rbits attr-dict `:` type($src) `->` type($dst) `(` $dstTy `)`"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src, $rbits}); + }]; +} + +// F32x4 -> F8x4 with stochastic rounding (supports E4M3FN, E5M2) +def NVVM_ConvertF32x4ToF8x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f8x4", "convert.f32x4.to.f8x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F6x4 with stochastic rounding (supports E2M3FN, E3M2FN) +def NVVM_ConvertF32x4ToF6x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f6x4", "convert.f32x4.to.f6x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F4x4 with stochastic rounding (supports E2M1FN) +def NVVM_ConvertF32x4ToF4x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f4x4", "convert.f32x4.to.f4x4", I16>; + //===----------------------------------------------------------------------===// // NVVM MMA Ops //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index a5ffb9e77fa9d..12c81629d7e76 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -365,6 +365,59 @@ LogicalResult ConvertF4x2ToF16x2Op::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +LogicalResult ConvertF32x2ToF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to f16x2."); + return success(); +} + +LogicalResult ConvertF32x2ToBF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to bf16x2."); + return success(); +} + +LogicalResult ConvertF32x4ToF8x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(getDstTy())) + return emitOpError("Only ") + << mlir::Float8E4M3FNType::get(ctx) << " and " + << mlir::Float8E5M2Type::get(ctx) + << " types are supported for conversions from f32x4 to f8x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF6x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float6E2M3FNType, mlir::Float6E3M2FNType>(getDstTy())) + return emitOpError("Only ") + << mlir::Float6E2M3FNType::get(ctx) << " and " + << mlir::Float6E3M2FNType::get(ctx) + << " types are supported for conversions from f32x4 to f6x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF4x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float4E2M1FNType>(getDstTy())) + return emitOpError("Only ") << mlir::Float4E2M1FNType::get(ctx) + << " type is supported for conversions from " + "f32x4 to f4x4."; + + return success(); +} + LogicalResult BulkStoreOp::verify() { if (getInitVal() != 0) return emitOpError("only 0 is supported for initVal, got ") << getInitVal(); @@ -2469,6 +2522,85 @@ Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, return TCGEN05_CP_2CTA(shape_mc, , is_2cta); \ }() 
+llvm::Intrinsic::ID ConvertF32x2ToF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2f16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x2ToBF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2bf16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x4ToF8x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite; + }) + .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F8 type in ConvertF32x4ToF8x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF6x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float6E2M3FNType>([&](mlir::Float6E2M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite; + }) + .Case<mlir::Float6E3M2FNType>([&](mlir::Float6E3M2FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F6 type in ConvertF32x4ToF6x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF4x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float4E2M1FNType>([&](mlir::Float4E2M1FNType) { + return hasRelu ? 
llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F4 type in ConvertF32x4ToF4x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) { auto curOp = cast<NVVM::Tcgen05CpOp>(op); bool is2CTA = curOp.getGroup() == CTAGroupKind::CTA_2; diff --git a/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir new file mode 100644 index 0000000000000..35f5e1b3c8ba2 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir @@ -0,0 +1,90 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// Test invalid target architecture (sm_100 instead of sm_100a) +gpu.module @invalid_arch_sm_100 [#nvvm.target<chip = "sm_100">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + // expected-error@+1 {{'nvvm.convert.f32x2.to.f16x2' op is not supported on sm_100}} + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// ----- + +// Test that operations require stochastic rounding mode +llvm.func @invalid_rnd_mode_f16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to f16x2.}} + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +llvm.func @invalid_rnd_mode_bf16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to bf16x2.}} + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test invalid destination types for f8x4 (should only accept f8E4M3FN, f8E5M2) +llvm.func @invalid_dst_type_f8x4_e3m4(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E3M4) + llvm.return %res : vector<4xi8> +} + +// ----- + +llvm.func @invalid_dst_type_f8x4_e8m0(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E8M0FNU) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination types for f6x4 (should only accept f6E2M3FN, f6E3M2FN) +llvm.func @invalid_dst_type_f6x4_f8(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x4 to f6x4.}} + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination type for f4x4 (should only accept f4E2M1FN) +llvm.func @invalid_dst_type_f4x4_f6(%src : vector<4xf32>, %rbits : i32) -> i16 { + // expected-error@+1 {{Only 'f4E2M1FN' type is supported for conversions from f32x4 to f4x4.}} + %res = nvvm.convert.f32x4.to.f4x4 
%src, %rbits : vector<4xf32> -> i16 (f6E2M3FN) + llvm.return %res : i16 +} + +// ----- + +// Test invalid rounding modes for non-stochastic ops +llvm.func @convert_float_to_tf32_rs_not_supported(%src : f32) -> i32 { + // expected-error @below {{Only {rn,rz,rna} rounding modes supported for ConvertFloatToTF32Op.}} + %res = nvvm.convert.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode<rs>} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_f32x2_to_f8x2_rs_not_supported(%a : f32, %b : f32) { + // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}} + %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rs>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN) + llvm.return +} + +// ----- + +llvm.func @convert_bf16x2_to_f8x2_rs_not_supported(%src : vector<2xbf16>) { + // expected-error @below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}} + %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rs>} : vector<2xbf16> -> i16 (f8E8M0FNU) + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir new file mode 100644 index 0000000000000..b5bb22350dcd7 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir @@ -0,0 +1,182 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// ----- + +// Test valid architectures work + +// Valid case on sm_100a +gpu.module @valid_f16x2_rs_sm_100a [#nvvm.target<chip = "sm_100a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// Valid case on sm_103a +gpu.module @valid_bf16x2_rs_sm_103a [#nvvm.target<chip = "sm_103a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0 : i32) : i32 + %res = nvvm.convert.f32x2.to.bf16x2 %f1, %f2, %rbits : vector<2xbf16> + return + } +} + +// ----- + +// Test F32x2 -> F16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs +llvm.func @convert_f32x2_to_f16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu +llvm.func @convert_f32x2_to_f16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu_satfinite +llvm.func 
@convert_f32x2_to_f16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +// Test F32x2 -> BF16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs +llvm.func @convert_f32x2_to_bf16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu +llvm.func @convert_f32x2_to_bf16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test F32x4 -> F8x4 (E4M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs +llvm.func @convert_f32x4_to_f8x4_e4m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs_relu +llvm.func @convert_f32x4_to_f8x4_e4m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F8x4 (E5M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs +llvm.func @convert_f32x4_to_f8x4_e5m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs_relu +llvm.func @convert_f32x4_to_f8x4_e5m2_rs_relu(%src : 
vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E2M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs +llvm.func @convert_f32x4_to_f6x4_e2m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs_relu +llvm.func @convert_f32x4_to_f6x4_e2m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E3M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs +llvm.func @convert_f32x4_to_f6x4_e3m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs_relu +llvm.func @convert_f32x4_to_f6x4_e3m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F4x4 (E2M1) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs +llvm.func @convert_f32x4_to_f4x4_e2m1_rs(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs_relu +llvm.func @convert_f32x4_to_f4x4_e2m1_rs_relu(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits {relu = true} : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + From 615299934489953deaf202cc445ac9f8ad362afc Mon Sep 17 00:00:00 2001 From: Jacek Caban <jacek@codeweavers.com> Date: Tue, 4 Nov 2025 00:04:36 +0100 Subject: [PATCH 101/313] [CodeGen][ARM64EC] Don't treat guest exit thunks as indirect calls (#165885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guest exit thunks serve as glue for performing direct calls, so they shouldn’t treat the target as an indirect one. Spotted by @coneco-cy in #165504. 
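
To illustrate the distinction (the IR shapes mirror the updated test below):

  call void @called()   ; direct call: routed through a guest exit thunk,
                        ; checked via __os_arm64x_check_icall
  call void %f()        ; indirect call: still checked via
                        ; __os_arm64x_check_icall_cfg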
---
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 14 ++----
 llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll  | 49 +++++++++++++++++--
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 1169f26a2ae37..97298f9d74171 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
   BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit);
   IRBuilder<> B(BB);
 
-  // Load the global symbol as a pointer to the check function.
-  Value *GuardFn;
-  if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf"))
-    GuardFn = GuardFnCFGlobal;
-  else
-    GuardFn = GuardFnGlobal;
-  LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn);
-
-  // Create new call instruction. The CFGuard check should always be a call,
-  // even if the original CallBase is an Invoke or CallBr instruction.
+  // Create new call instruction. The call check should always be a call,
+  // even if the original CallBase is an Invoke or CallBr instruction.
+  // This is treated as a direct call, so do not use GuardFnCFGlobal.
+  LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal);
   Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes());
   CallInst *GuardCheck = B.CreateCall(
       GuardFnType, GuardCheckLoad, {F, Thunk});

diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
index bdbc99e2d98b0..75e7ac902274d 100644
--- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
+++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
@@ -2,15 +2,58 @@
 declare void @called()
 declare void @escaped()
-define void @f(ptr %dst) {
+define void @f(ptr %dst, ptr readonly %f) {
   call void @called()
+; CHECK: bl "#called"
   store ptr @escaped, ptr %dst
-  ret void
+  call void %f()
+; CHECK: adrp x10, $iexit_thunk$cdecl$v$v
+; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v
+; CHECK-NEXT: str x8, [x20]
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg]
+; CHECK-NEXT: mov x11,
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: blr x11
+  ret void
 }
+; CHECK-LABEL: .def "#called$exit_thunk";
+; CHECK-NEXT: .scl 2;
+; CHECK-NEXT: .type 32;
+; CHECK-NEXT: .endef
+; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk"
+; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk"
+; CHECK-NEXT: .weak_anti_dep called
+; CHECK-NEXT: called = "#called"
+; CHECK-NEXT: .weak_anti_dep "#called"
+; CHECK-NEXT: "#called" = "#called$exit_thunk"
+; CHECK-NEXT: .seh_proc "#called$exit_thunk"
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx From 73ef4dd48fa3ef64a0f291cb822ab66289780b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 13:11:10 -1000 Subject: [PATCH 102/313] [flang][cuda] Add missing semi-colon in inlined ptx (#166254) This would trigger an error in ptxas. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 ++-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2db0606d2bc9e..6ebd52dcd42ea 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -9501,7 +9501,7 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { builder, loc, dst, src, fir::getBase(args[2]), {}, {}); mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); + "cp.async.bulk.commit_group;", {}); mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, builder.getI32IntegerAttr(0), {}); } @@ -9517,7 +9517,7 @@ static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, size, {}, {}); mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); + "cp.async.bulk.commit_group;", {}); mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, builder.getI32IntegerAttr(0), {}); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index ed015df263070..666c394ad6678 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -494,7 +494,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -675,7 +675,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" !
CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -688,7 +688,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -701,7 +701,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -714,7 +714,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -728,7 +728,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -741,7 +741,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -754,5 +754,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 From 285b57b1a68e6738dac047f5f6461f231448b0f5 Mon Sep 17 00:00:00 2001 From: Rafael Auler <rafaelauler@meta.com> Date: Mon, 3 Nov 2025 15:11:29 -0800 Subject: [PATCH 103/313] Update BOLT's README.md example optimization flag (#166251) Drop hfsort in favor of a more modern function reordering algorithm. --- bolt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. 
Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option From d4c41b7fa30be06b5250c0d5abc7a26a83420321 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Tue, 4 Nov 2025 08:34:52 +0900 Subject: [PATCH 104/313] [mlir][Transforms] Dialect Conversion: Convert entry block only (#165180) When converting a function, convert only the entry block signature. The remaining block signatures should be converted by the respective branching ops. The `FuncToLLVM` / `ControlFlowToLLVM` patterns already use that design. ```c++ struct BranchOpLowering : public ConvertOpToLLVMPattern<cf::BranchOp> { LogicalResult matchAndRewrite(cf::BranchOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // Convert successor block. SmallVector<Value> flattenedAdaptor = flattenValues(adaptor.getOperands()); FailureOr<Block *> convertedBlock = getConvertedBlock(rewriter, getTypeConverter(), op, op.getSuccessor(), TypeRange(ValueRange(flattenedAdaptor))); // ... } }; ``` This is consistent with the fact that operations from unreachable blocks are not put on the initial worklist. With this change, parent ops are no longer recursively legalized when inserting a block, simplifying the conversion driver a bit. Note for LLVM integration: If you are seeing failures, make sure to: - Drop `converter.isLegal(&op.getBody())` when checking the legality of a function op. Only the entry block signature / function type should be taken into account. - If you need to convert all reachable blocks and are using `cf` branching ops, add `populateCFStructuralTypeConversionsAndLegality`. - If you need to convert all reachable blocks and are using custom branching ops, implement and populate custom structural type conversion patterns, similar to `populateCFStructuralTypeConversionsAndLegality`. --- .../Transforms/Utils/DialectConversion.cpp | 112 ++++-------------- .../test-legalizer-no-materializations.mlir | 67 +++++++++++ mlir/test/Transforms/test-legalizer.mlir | 39 ------ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 6 +- 4 files changed, 89 insertions(+), 135 deletions(-) create mode 100644 mlir/test/Transforms/test-legalizer-no-materializations.mlir diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 3a23bbfd70eac..2fe06970eb568 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1105,10 +1105,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// A set of operations that were modified by the current pattern. SetVector<Operation *> patternModifiedOps; - /// A set of blocks that were inserted (newly-created blocks or moved blocks) - /// by the current pattern. - SetVector<Block *> patternInsertedBlocks; - /// A list of unresolved materializations that were created by the current /// pattern. 
DenseSet<UnrealizedConversionCastOp> patternMaterializations; @@ -2046,8 +2042,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted( if (!config.allowPatternRollback && config.listener) config.listener->notifyBlockInserted(block, previous, previousIt); - patternInsertedBlocks.insert(block); - if (wasDetached) { // If the block was detached, it is most likely a newly created block. if (config.allowPatternRollback) { @@ -2399,17 +2393,12 @@ class OperationLegalizer { bool canApplyPattern(Operation *op, const Pattern &pattern); /// Legalize the resultant IR after successfully applying the given pattern. - LogicalResult legalizePatternResult(Operation *op, const Pattern &pattern, - const RewriterState &curState, - const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks); - - /// Legalizes the actions registered during the execution of a pattern. LogicalResult - legalizePatternBlockRewrites(Operation *op, - const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps); + legalizePatternResult(Operation *op, const Pattern &pattern, + const RewriterState &curState, + const SetVector<Operation *> &newOps, + const SetVector<Operation *> &modifiedOps); + LogicalResult legalizePatternCreatedOperations(const SetVector<Operation *> &newOps); LogicalResult @@ -2608,7 +2597,6 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { auto cleanup = llvm::make_scope_exit([&]() { rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); }); // Upon failure, undo all changes made by the folder. @@ -2662,24 +2650,16 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { static void reportNewIrLegalizationFatalError(const Pattern &pattern, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { auto newOpNames = llvm::map_range( newOps, [](Operation *op) { return op->getName().getStringRef(); }); auto modifiedOpNames = llvm::map_range( modifiedOps, [](Operation *op) { return op->getName().getStringRef(); }); - StringRef detachedBlockStr = "(detached block)"; - auto insertedBlockNames = llvm::map_range(insertedBlocks, [&](Block *block) { - if (block->getParentOp()) - return block->getParentOp()->getName().getStringRef(); - return detachedBlockStr; - }); - llvm::report_fatal_error( - "pattern '" + pattern.getDebugName() + - "' produced IR that could not be legalized. " + "new ops: {" + - llvm::join(newOpNames, ", ") + "}, " + "modified ops: {" + - llvm::join(modifiedOpNames, ", ") + "}, " + "inserted block into ops: {" + - llvm::join(insertedBlockNames, ", ") + "}"); + llvm::report_fatal_error("pattern '" + pattern.getDebugName() + + "' produced IR that could not be legalized. 
" + + "new ops: {" + llvm::join(newOpNames, ", ") + "}, " + + "modified ops: {" + + llvm::join(modifiedOpNames, ", ") + "}"); } LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { @@ -2743,7 +2723,6 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { } rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); if (rewriterImpl.config.notifyCallback) { @@ -2777,15 +2756,12 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { SetVector<Operation *> newOps = moveAndReset(rewriterImpl.patternNewOps); SetVector<Operation *> modifiedOps = moveAndReset(rewriterImpl.patternModifiedOps); - SetVector<Block *> insertedBlocks = - moveAndReset(rewriterImpl.patternInsertedBlocks); - auto result = legalizePatternResult(op, pattern, curState, newOps, - modifiedOps, insertedBlocks); + auto result = + legalizePatternResult(op, pattern, curState, newOps, modifiedOps); appliedPatterns.erase(&pattern); if (failed(result)) { if (!rewriterImpl.config.allowPatternRollback) - reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps, - insertedBlocks); + reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps); rewriterImpl.resetState(curState, pattern.getDebugName()); } if (config.listener) @@ -2823,8 +2799,7 @@ bool OperationLegalizer::canApplyPattern(Operation *op, LogicalResult OperationLegalizer::legalizePatternResult( Operation *op, const Pattern &pattern, const RewriterState &curState, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { [[maybe_unused]] auto &impl = rewriter.getImpl(); assert(impl.pendingRootUpdates.empty() && "dangling root updates"); @@ -2843,8 +2818,7 @@ LogicalResult OperationLegalizer::legalizePatternResult( #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS // Legalize each of the actions registered during application. - if (failed(legalizePatternBlockRewrites(op, insertedBlocks, newOps)) || - failed(legalizePatternRootUpdates(modifiedOps)) || + if (failed(legalizePatternRootUpdates(modifiedOps)) || failed(legalizePatternCreatedOperations(newOps))) { return failure(); } @@ -2853,53 +2827,6 @@ LogicalResult OperationLegalizer::legalizePatternResult( return success(); } -LogicalResult OperationLegalizer::legalizePatternBlockRewrites( - Operation *op, const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps) { - ConversionPatternRewriterImpl &impl = rewriter.getImpl(); - SmallPtrSet<Operation *, 16> alreadyLegalized; - - // If the pattern moved or created any blocks, make sure the types of block - // arguments get legalized. - for (Block *block : insertedBlocks) { - if (impl.erasedBlocks.contains(block)) - continue; - - // Only check blocks outside of the current operation. - Operation *parentOp = block->getParentOp(); - if (!parentOp || parentOp == op || block->getNumArguments() == 0) - continue; - - // If the region of the block has a type converter, try to convert the block - // directly. 
- if (auto *converter = impl.regionToConverter.lookup(block->getParent())) { - std::optional<TypeConverter::SignatureConversion> conversion = - converter->convertBlockSignature(block); - if (!conversion) { - LLVM_DEBUG(logFailure(impl.logger, "failed to convert types of moved " - "block")); - return failure(); - } - impl.applySignatureConversion(block, converter, *conversion); - continue; - } - - // Otherwise, try to legalize the parent operation if it was not generated - // by this pattern. This is because we will attempt to legalize the parent - // operation, and blocks in regions created by this pattern will already be - // legalized later on. - if (!newOps.count(parentOp) && alreadyLegalized.insert(parentOp).second) { - if (failed(legalize(parentOp))) { - LLVM_DEBUG(logFailure( - impl.logger, "operation '{0}'({1}) became illegal after rewrite", - parentOp->getName(), parentOp)); - return failure(); - } - } - } - return success(); -} - LogicalResult OperationLegalizer::legalizePatternCreatedOperations( const SetVector<Operation *> &newOps) { for (Operation *op : newOps) { @@ -3800,10 +3727,11 @@ static LogicalResult convertFuncOpTypes(FunctionOpInterface funcOp, TypeConverter::SignatureConversion result(type.getNumInputs()); SmallVector<Type, 1> newResults; if (failed(typeConverter.convertSignatureArgs(type.getInputs(), result)) || - failed(typeConverter.convertTypes(type.getResults(), newResults)) || - failed(rewriter.convertRegionTypes(&funcOp.getFunctionBody(), - typeConverter, &result))) + failed(typeConverter.convertTypes(type.getResults(), newResults))) return failure(); + if (!funcOp.getFunctionBody().empty()) + rewriter.applySignatureConversion(&funcOp.getFunctionBody().front(), result, + &typeConverter); // Update the function signature in-place. 
auto newType = FunctionType::get(rewriter.getContext(), diff --git a/mlir/test/Transforms/test-legalizer-no-materializations.mlir b/mlir/test/Transforms/test-legalizer-no-materializations.mlir new file mode 100644 index 0000000000000..82dd7422b22b2 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-no-materializations.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND + +// CHECK-LABEL: func @dropped_input_in_use +// CHECK-KIND-LABEL: func @dropped_input_in_use +func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { + // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 + // CHECK-NEXT: "work"(%[[cast]]) : (i16) + // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} + // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) + // expected-remark@+1 {{op 'work' is not legalizable}} + "work"(%arg) : (i16) -> () +} + +// ----- + +// CHECK-KIND-LABEL: func @test_lookup_without_converter +// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} +// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () +func.func @test_lookup_without_converter() { + %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) + "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () + // Make sure that the second "replace_with_valid_consumer" lowering does not + // lookup the materialization that was created for the above op. 
+ "test.replace_with_valid_consumer"(%0) : (i64) -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_moved_region_args +func.func @remap_moved_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_cloned_region_args +func.func @remap_cloned_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) {legalizer.should_clone} : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 94c5bb4e93b06..7c43bb7bface0 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -1,7 +1,6 @@ // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s --check-prefix=CHECK-PROFILER // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND // CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B" // CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B" @@ -146,36 +145,6 @@ func.func @no_remap_nested() { // ----- -// CHECK-LABEL: func @remap_moved_region_args -func.func @remap_moved_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, 
%i1, %2) : (i64, i64, f32) -> () - }) : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - -// ----- - -// CHECK-LABEL: func @remap_cloned_region_args -func.func @remap_cloned_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) {legalizer.should_clone} : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - // CHECK-LABEL: func @remap_drop_region func.func @remap_drop_region() { // CHECK-NEXT: return @@ -191,12 +160,9 @@ func.func @remap_drop_region() { // ----- // CHECK-LABEL: func @dropped_input_in_use -// CHECK-KIND-LABEL: func @dropped_input_in_use func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 // CHECK-NEXT: "work"(%[[cast]]) : (i16) - // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} - // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) // expected-remark@+1 {{op 'work' is not legalizable}} "work"(%arg) : (i16) -> () } @@ -452,11 +418,6 @@ func.func @test_multiple_1_to_n_replacement() { // CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 // CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () // CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () -// CHECK-KIND-LABEL: func @test_lookup_without_converter -// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 -// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} -// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () -// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () func.func @test_lookup_without_converter() { %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index fd2b943ff1296..12edecc113495 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1553,8 +1553,7 @@ struct TestLegalizePatternDriver [](Type type) { return type.isF32(); }); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<func::CallOp>( [&](func::CallOp op) { return converter.isLegal(op); }); @@ -2156,8 +2155,7 @@ struct TestTypeConversionDriver recursiveType.getName() == "outer_converted_type"); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<TestCastOp>([&](TestCastOp op) { // Allow casts from F64 to F32. 
From 590a2b0a1f3250143d0c5c8c6ab02cba2fcd46ba Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 16:00:29 -0800 Subject: [PATCH 105/313] Revert "ARM: Remove unnecessary manual ABI lowering for sincos_stret (#166040)" (#166262) This reverts commit a522ae3ef6e13cb39e7756c151652e03a024b301. The ABI handling doesn't account for matching the C ABI, only implicit sret. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 39 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3a00267395504..6b0653457cbaf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -9869,12 +9869,32 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; + bool ShouldUseSRet = getTM().isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const Align StackAlign = DL.getPrefTypeAlign(RetTy); + int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); + + ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + Args.emplace_back(Arg, ArgTy); StringRef LibcallName = getLibcallImplName(SincosStret); @@ -9884,10 +9904,25 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)); + .setCallee(CC, RetTy, Callee, std::move(Args)) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; + if (!ShouldUseSRet) + return CallResult.first; + + SDValue LoadSin = + DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); + + // Address of cos field. 
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); + SDValue LoadCos = + DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, From c77b614564c69c7e63363859bf9b37427614eabd Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 16:00:54 -0800 Subject: [PATCH 106/313] ARM: Add more ABIs to llvm.sincos test (#166264) Make sure the iOS targets with/without sincos_stret are tested --- llvm/test/CodeGen/ARM/llvm.sincos.ll | 910 +++++++++++++++++++++------ 1 file changed, 735 insertions(+), 175 deletions(-) diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..0c2263ee9acbf 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,783 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl
_cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 
+; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add 
r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; 
IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { float, float } @llvm.sincos.f32(float %a) ret { 
float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] 
+; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! 
-; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! +; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! 
+; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} %result = call { fp128, fp128 } 
@llvm.sincos.f128(fp128 %a)
  ret { fp128, fp128 } %result
}

From f7fff18ad09680056f028a99a961d4120063c55b Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Tue, 4 Nov 2025 01:02:36 +0100
Subject: [PATCH 107/313] [mlir][OpenACC] add unstructured attributes for
 acc.loop with early exits (#164990)

The "!$acc loop" directive may be placed above loops with early exits.
Currently, flang lowers loops with early exits to explicit control flow
(this may be revisited when MLIR allows early exits in structured
regions). In such cases, the acc loop directive cannot simply be ignored
during lowering, because it may hold data clauses that should be applied
when reaching that point.

This patch adds an "unstructured" attribute to acc.loop to support that
case. An acc.loop with this attribute may hold data operands but must
have no controls. It is expected that the loop logic is implemented in
its body in a way that the acc dialect may not understand. Such an
acc.loop is just a container, and the loop with early exits is executed
sequentially.
---
 mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td |  9 ++++++++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp         |  8 ++++++--
 mlir/test/Dialect/OpenACC/invalid.mlir          |  9 +++++++++
 mlir/test/Dialect/OpenACC/ops.mlir              | 14 ++++++++++++++
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 2f4517ddfe754..c689b7e46ea9e 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2557,6 +2557,12 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     device-type-aware getter methods. When modifying these operands, the
     corresponding `device_type` attributes must be updated to maintain
     consistency between operands and their target device types.
+
+    The `unstructured` attribute indicates that the loops inside the OpenACC
+    construct contain early exits and cannot be lowered to structured MLIR
+    operations. When this flag is set, the acc.loop should have no induction
+    variables and the loop must be implemented via explicit control flow
+    inside its body.
   }];

   let arguments = (ins
@@ -2590,7 +2596,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     OptionalAttr<SymbolRefArrayAttr>:$firstprivatizationRecipes,
     Variadic<AnyType>:$reductionOperands,
     OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
-    OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined
+    OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined,
+    UnitAttr:$unstructured
   );

   let results = (outs Variadic<AnyType>:$results);

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 35eba724a9059..b2f1d840f3bca 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -3068,8 +3068,12 @@ LogicalResult acc::LoopOp::verify() {
   if (getRegion().empty())
     return emitError("expected non-empty body.");

-  // When it is container-like - it is expected to hold a loop-like operation.
-  if (isContainerLike()) {
+  if (getUnstructured()) {
+    if (!isContainerLike())
+      return emitError(
+          "unstructured acc.loop must not have induction variables");
+  } else if (isContainerLike()) {
+    // When it is container-like - it is expected to hold a loop-like operation.
     // Obtain the maximum collapse count - we use this to check that there
     // are enough loops contained.
    uint64_t collapseCount = getCollapseValue().value_or(1);

diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index 26b63fbe182ea..0e75894eaeceb 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -492,6 +492,15 @@ func.func @fct1(%0 : !llvm.ptr) -> () {

 // -----

+%i1 = arith.constant 1 : i32
+%i2 = arith.constant 10 : i32
+// expected-error@+1 {{unstructured acc.loop must not have induction variables}}
+acc.loop control(%iv : i32) = (%i1 : i32) to (%i2 : i32) step (%i1 : i32) {
+  acc.yield
+} attributes {independent = [#acc.device_type<none>], unstructured}
+
+// -----
+
 // expected-error@+1 {{expect at least one of num, dim or static values}}
 acc.loop gang({}) {
   "test.openacc_dummy_op"() : () -> ()

diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 042ee2503cb95..df8ab9b7dd239 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -2143,6 +2143,20 @@ func.func @acc_loop_container() {

 // -----

+func.func @acc_unstructured_loop() {
+  acc.loop {
+    acc.yield
+  } attributes {independent = [#acc.device_type<none>], unstructured}
+  return
+}
+
+// CHECK-LABEL: func.func @acc_unstructured_loop
+// CHECK: acc.loop
+// CHECK: acc.yield
+// CHECK: } attributes {independent = [#acc.device_type<none>], unstructured}
+
+// -----
+
 // Test private recipe with data bounds for array slicing
 acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init {
 ^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty):

From 02d93f7abbd555d93eae94c670fa0aba95758f00 Mon Sep 17 00:00:00 2001
From: barsolo2000 <barsolo@meta.com>
Date: Mon, 3 Nov 2025 17:22:35 -0800
Subject: [PATCH 108/313] [RISCV] Adding `vlenb` register as callee register
 (#165796)

In recent debug sessions we noticed that the GDB debugger shows more of
the stack trace than LLDB. After enabling the unwinding log, the issue
appears to be that the CFA depends on the value of vlenb. Since vlenb
does not change at runtime, its value from frame 0 can be assumed for
all outer frames (see the illustrative sketch below).

Co-authored-by: Bar Soloveychik <barsolo@fb.com>
---
 lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
index ff37b48d86ca8..a5547a4699ca9 100644
--- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
+++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
@@ -798,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) {
           .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"},
                  is_hw_fp)
           .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp)
+          // vlenb is constant and needed for vector unwinding.
+          .Case("vlenb", true)
           .Default(false);

   return is_callee_saved;

From a22d1c2225543aa9ae7882f6b1a97ee7b2c95574 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 4 Nov 2025 01:31:18 +0000
Subject: [PATCH 109/313] Revert "[Polly] Introduce PhaseManager and remove
 LPM support (#125442)"

This reverts commit e987ab11a6f3d3965ef26fc42c82db3e8b1d56f5.

This broke premerge:
1. https://lab.llvm.org/staging/#/builders/192/builds/9521
2. https://github.com/llvm/llvm-project/actions/runs/19054182009

Notably, this did not break inside the PR; it is not exactly clear why.
I realize that there is a lot of test churn here, but the changes are
largely in Polly, where commit frequency is much lower, so a reapply of
the patch should be clean.
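
To make the vlenb reasoning from the lldb patch above concrete, here is a
minimal C++ sketch of a vlenb-dependent CFA rule. This is not LLDB code: the
rule shape (CFA = SP + Multiplier * VLENB + Offset), the names, and the
constants are all assumptions chosen for illustration only.

    #include <cstdint>
    #include <iostream>

    // Hypothetical CFA rule of the shape: CFA = SP + Multiplier * VLENB + Offset.
    // Frames that spill RVV registers can yield such vlenb-dependent rules.
    struct CfaRule {
      uint64_t Multiplier;
      uint64_t Offset;
    };

    uint64_t computeCfa(const CfaRule &Rule, uint64_t SP, uint64_t Vlenb) {
      return SP + Rule.Multiplier * Vlenb + Rule.Offset;
    }

    int main() {
      // vlenb = VLEN / 8, e.g. 16 for a 128-bit vector unit. Because the
      // register is constant at runtime, the value captured in frame 0 can be
      // reused when evaluating CFA rules for outer frames; if vlenb is not
      // treated as callee-saved, the unwinder discards it after frame 0 and
      // the backtrace stops early.
      const uint64_t VlenbFromFrame0 = 16;
      const CfaRule Rule{/*Multiplier=*/2, /*Offset=*/32};
      std::cout << std::hex << "CFA = 0x"
                << computeCfa(Rule, /*SP=*/0x7ffff000, VlenbFromFrame0) << '\n';
      return 0;
    }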
--- polly/docs/ReleaseNotes.rst | 4 - polly/include/polly/Canonicalization.h | 8 + polly/include/polly/CodeGen/CodeGeneration.h | 3 - polly/include/polly/CodeGen/IslAst.h | 36 +- polly/include/polly/CodePreparation.h | 6 - polly/include/polly/DeLICM.h | 11 +- polly/include/polly/DeadCodeElimination.h | 13 +- polly/include/polly/DependenceInfo.h | 106 ++++- polly/include/polly/FlattenSchedule.h | 16 +- polly/include/polly/ForwardOpTree.h | 20 +- polly/include/polly/JSONExporter.h | 13 +- polly/include/polly/LinkAllPasses.h | 156 +++++++ polly/include/polly/MaximalStaticExpansion.h | 2 - polly/include/polly/Pass/PhaseManager.h | 127 ------ polly/include/polly/Pass/PollyFunctionPass.h | 32 -- polly/include/polly/Pass/PollyModulePass.h | 30 -- polly/include/polly/PruneUnprofitable.h | 12 +- polly/include/polly/RegisterPasses.h | 2 + polly/include/polly/ScheduleOptimizer.h | 16 +- polly/include/polly/ScopDetection.h | 27 ++ polly/include/polly/ScopGraphPrinter.h | 3 - polly/include/polly/ScopInfo.h | 76 ++++ polly/include/polly/ScopInliner.h | 6 + polly/include/polly/ScopPass.h | 28 ++ polly/include/polly/Simplify.h | 23 +- .../include/polly/Support/DumpFunctionPass.h | 12 + polly/include/polly/Support/DumpModulePass.h | 15 +- polly/include/polly/Support/ScopHelper.h | 8 + polly/lib/Analysis/DependenceInfo.cpp | 220 ++++++++- polly/lib/Analysis/PruneUnprofitable.cpp | 32 +- polly/lib/Analysis/ScopBuilder.cpp | 1 - polly/lib/Analysis/ScopDetection.cpp | 107 +++++ polly/lib/Analysis/ScopGraphPrinter.cpp | 120 ++++- polly/lib/Analysis/ScopInfo.cpp | 228 ++++++++++ polly/lib/Analysis/ScopPass.cpp | 36 ++ polly/lib/CMakeLists.txt | 3 - polly/lib/CodeGen/CodeGeneration.cpp | 72 ++- polly/lib/CodeGen/IslAst.cpp | 101 ++++- polly/lib/Exchange/JSONExporter.cpp | 160 ++++++- polly/lib/Pass/PhaseManager.cpp | 424 ------------------ polly/lib/Pass/PollyFunctionPass.cpp | 22 - polly/lib/Pass/PollyModulePass.cpp | 29 -- polly/lib/Support/DumpFunctionPass.cpp | 41 ++ polly/lib/Support/DumpModulePass.cpp | 47 ++ polly/lib/Support/PollyPasses.def | 25 +- polly/lib/Support/RegisterPasses.cpp | 401 ++++++----------- polly/lib/Support/ScopHelper.cpp | 12 + polly/lib/Transform/Canonicalization.cpp | 65 +++ polly/lib/Transform/CodePreparation.cpp | 69 +++ polly/lib/Transform/DeLICM.cpp | 125 ++++-- polly/lib/Transform/DeadCodeElimination.cpp | 44 +- polly/lib/Transform/FlattenSchedule.cpp | 139 ++++-- polly/lib/Transform/ForwardOpTree.cpp | 131 ++++-- .../lib/Transform/MaximalStaticExpansion.cpp | 81 +++- polly/lib/Transform/ScheduleOptimizer.cpp | 152 ++++++- polly/lib/Transform/ScopInliner.cpp | 46 ++ polly/lib/Transform/Simplify.cpp | 101 ++++- polly/test/CodeGen/20100617.ll | 2 +- polly/test/CodeGen/20100622.ll | 4 +- polly/test/CodeGen/20100707.ll | 2 +- polly/test/CodeGen/20100707_2.ll | 2 +- polly/test/CodeGen/20100708.ll | 2 +- polly/test/CodeGen/20100708_2.ll | 2 +- polly/test/CodeGen/20100713.ll | 2 +- polly/test/CodeGen/20100713_2.ll | 2 +- polly/test/CodeGen/20100717.ll | 2 +- polly/test/CodeGen/20100718-DomInfo-2.ll | 2 +- polly/test/CodeGen/20100718-DomInfo.ll | 2 +- .../CodeGen/20100720-MultipleConditions.ll | 2 +- .../test/CodeGen/20100809-IndependentBlock.ll | 2 +- ...0100811-ScalarDependencyBetweenBrAndCnd.ll | 2 +- polly/test/CodeGen/20101030-Overflow.ll | 2 +- polly/test/CodeGen/20101103-Overflow3.ll | 2 +- polly/test/CodeGen/20101103-signmissmatch.ll | 2 +- .../test/CodeGen/20110226-Ignore-Dead-Code.ll | 2 +- .../test/CodeGen/20110226-PHI-Node-removed.ll | 2 +- 
polly/test/CodeGen/20120316-InvalidCast.ll | 2 +- .../CodeGen/20120403-RHS-type-mismatch.ll | 2 +- polly/test/CodeGen/20130221.ll | 2 +- .../20150328-SCEVExpanderIntroducesNewIV.ll | 2 +- polly/test/CodeGen/Intrinsics/llvm-expect.ll | 2 +- .../do_not_mutate_debug_info.ll | 2 +- .../loop_nest_param_parallel.ll | 2 +- .../single_loop_param_parallel.ll | 4 +- polly/test/CodeGen/MemAccess/bad_alignment.ll | 2 +- .../MemAccess/codegen_address_space.ll | 2 +- .../MemAccess/codegen_constant_offset.ll | 2 +- .../test/CodeGen/MemAccess/codegen_simple.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_float.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_md.ll | 4 +- .../MemAccess/codegen_simple_md_float.ll | 4 +- polly/test/CodeGen/MemAccess/create_arrays.ll | 4 +- .../CodeGen/MemAccess/create_arrays_heap.ll | 4 +- .../default_aligned_new_access_function.ll | 2 +- .../test/CodeGen/MemAccess/different_types.ll | 4 +- polly/test/CodeGen/MemAccess/generate-all.ll | 6 +- .../CodeGen/MemAccess/invariant_base_ptr.ll | 4 +- .../CodeGen/MemAccess/map_scalar_access.ll | 4 +- .../test/CodeGen/MemAccess/multiple_types.ll | 4 +- polly/test/CodeGen/MemAccess/simple.ll | 2 +- .../test/CodeGen/MemAccess/simple_analyze.ll | 2 +- .../MemAccess/update_access_functions.ll | 4 +- .../CodeGen/Metadata/basic_vec_annotate.ll | 2 +- polly/test/CodeGen/OpenMP/alias-metadata.ll | 2 +- .../floord-as-argument-to-subfunction.ll | 2 +- polly/test/CodeGen/OpenMP/inlineasm.ll | 2 +- .../invariant_base_pointer_preloaded.ll | 3 +- ...ant_base_pointer_preloaded_different_bb.ll | 3 +- ...base_pointer_preloaded_pass_only_needed.ll | 3 +- .../invariant_base_pointers_preloaded.ll | 3 +- .../OpenMP/loop-body-references-outer-iv.ll | 4 +- .../loop-body-references-outer-values-2.ll | 4 +- .../loop-body-references-outer-values-3.ll | 4 +- .../loop-body-references-outer-values.ll | 4 +- .../OpenMP/loop-bounds-reference-outer-ids.ll | 4 +- .../test/CodeGen/OpenMP/mapped-phi-access.ll | 2 +- polly/test/CodeGen/OpenMP/matmul-parallel.ll | 4 +- .../CodeGen/OpenMP/new_multidim_access.ll | 8 +- polly/test/CodeGen/OpenMP/recomputed-srem.ll | 3 +- ...ference-argument-from-non-affine-region.ll | 19 +- .../test/CodeGen/OpenMP/reference-other-bb.ll | 2 +- .../OpenMP/reference-preceeding-loop.ll | 4 +- polly/test/CodeGen/OpenMP/reference_latest.ll | 2 +- polly/test/CodeGen/OpenMP/scev-rewriting.ll | 2 +- polly/test/CodeGen/OpenMP/single_loop.ll | 18 +- ...single_loop_with_loop_invariant_baseptr.ll | 4 +- .../CodeGen/OpenMP/single_loop_with_param.ll | 16 +- ...o-parallel-loops-reference-outer-indvar.ll | 4 +- polly/test/CodeGen/PHIInExit.ll | 2 +- .../combine_different_values.ll | 4 +- .../RuntimeDebugBuilder/stmt_tracing.ll | 2 +- polly/test/CodeGen/alias-check-multi-dim.ll | 3 +- .../CodeGen/alias_metadata_too_many_arrays.ll | 3 +- ...aliasing_different_base_and_access_type.ll | 2 +- .../aliasing_different_pointer_types.ll | 2 +- .../aliasing_multidimensional_access.ll | 2 +- .../CodeGen/aliasing_parametric_simple_1.ll | 2 +- .../CodeGen/aliasing_parametric_simple_2.ll | 2 +- polly/test/CodeGen/aliasing_struct_element.ll | 2 +- polly/test/CodeGen/alignment.ll | 2 +- polly/test/CodeGen/annotated_alias_scopes.ll | 2 +- polly/test/CodeGen/blas_sscal_simplified.ll | 2 +- ...code-hosting-and-escape-map-computation.ll | 2 +- polly/test/CodeGen/constant_condition.ll | 2 +- polly/test/CodeGen/create-conditional-scop.ll | 2 +- ...d_instruction_referenced_by_parameter_1.ll | 2 +- ...d_instruction_referenced_by_parameter_2.ll | 2 +- 
polly/test/CodeGen/debug-intrinsics.ll | 8 +- ...nce_problem_after_early_codegen_bailout.ll | 2 +- polly/test/CodeGen/empty_domain_in_context.ll | 2 +- polly/test/CodeGen/entry_with_trivial_phi.ll | 2 +- .../entry_with_trivial_phi_other_bb.ll | 2 +- .../error-stmt-in-non-affine-region.ll | 2 +- ...or_block_contains_invalid_memory_access.ll | 2 +- polly/test/CodeGen/exprModDiv.ll | 7 +- .../hoisted_load_escapes_through_phi.ll | 6 +- polly/test/CodeGen/hoisting_1.ll | 2 +- polly/test/CodeGen/hoisting_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_1.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_3.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_in_lb.ll | 4 +- .../inner_scev_sdiv_in_lb_invariant.ll | 3 +- polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll | 3 +- polly/test/CodeGen/intrinsics_lifetime.ll | 2 +- polly/test/CodeGen/intrinsics_misc.ll | 2 +- .../inv-load-lnt-crash-wrong-order-2.ll | 3 +- .../inv-load-lnt-crash-wrong-order-3.ll | 3 +- .../CodeGen/inv-load-lnt-crash-wrong-order.ll | 3 +- .../test/CodeGen/invariant-load-dimension.ll | 4 +- ...-load-preload-base-pointer-origin-first.ll | 2 +- .../CodeGen/invariant_cannot_handle_void.ll | 4 +- polly/test/CodeGen/invariant_load.ll | 2 +- .../CodeGen/invariant_load_address_space.ll | 2 +- .../CodeGen/invariant_load_alias_metadata.ll | 3 +- .../CodeGen/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...variant_load_base_pointer_conditional_2.ll | 6 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- .../test/CodeGen/invariant_load_condition.ll | 2 +- .../invariant_load_different_sized_types.ll | 3 +- polly/test/CodeGen/invariant_load_escaping.ll | 2 +- .../invariant_load_escaping_second_scop.ll | 2 +- .../invariant_load_in_non_affine_subregion.ll | 2 +- polly/test/CodeGen/invariant_load_loop_ub.ll | 2 +- ...ant_load_not_executed_but_in_parameters.ll | 2 +- .../test/CodeGen/invariant_load_outermost.ll | 2 +- ...riant_load_parameters_cyclic_dependence.ll | 4 +- .../CodeGen/invariant_load_ptr_ptr_noalias.ll | 2 +- .../test/CodeGen/invariant_load_scalar_dep.ll | 2 +- ...riant_load_scalar_escape_alloca_sharing.ll | 2 +- ...oads_from_struct_with_different_types_1.ll | 2 +- ...oads_from_struct_with_different_types_2.ll | 2 +- ...invariant_loads_ignore_parameter_bounds.ll | 3 +- .../invariant_verify_function_failed.ll | 2 +- .../invariant_verify_function_failed_2.ll | 4 +- polly/test/CodeGen/issue56692.ll | 2 +- .../large-numbers-in-boundary-context.ll | 2 +- .../test/CodeGen/load_subset_with_context.ll | 2 +- .../loop-invariant-load-type-mismatch.ll | 2 +- polly/test/CodeGen/loop_with_condition.ll | 2 +- polly/test/CodeGen/loop_with_condition_2.ll | 2 +- .../test/CodeGen/loop_with_condition_ineq.ll | 2 +- .../CodeGen/loop_with_condition_nested.ll | 4 +- ..._conditional_entry_edge_split_hard_case.ll | 2 +- polly/test/CodeGen/memcpy_annotations.ll | 2 +- .../multidim-non-matching-typesize-2.ll | 3 +- .../CodeGen/multidim-non-matching-typesize.ll | 3 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- polly/test/CodeGen/multidim_alias_check.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 4 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- .../multiple-types-invariant-load-2.ll | 3 +- .../CodeGen/multiple-types-invariant-load.ll | 3 +- .../multiple_sai_fro_same_base_address.ll | 4 +- polly/test/CodeGen/no-overflow-tracking.ll | 4 +- polly/test/CodeGen/no_guard_bb.ll | 2 +- ...non-affine-dominance-generated-entering.ll | 2 +- 
.../CodeGen/non-affine-exit-node-dominance.ll | 2 +- .../non-affine-phi-node-expansion-2.ll | 3 +- .../non-affine-phi-node-expansion-3.ll | 3 +- .../non-affine-phi-node-expansion-4.ll | 3 +- .../CodeGen/non-affine-phi-node-expansion.ll | 3 +- ...e-region-exit-phi-incoming-synthesize-2.ll | 2 +- ...ine-region-exit-phi-incoming-synthesize.ll | 2 +- .../non-affine-region-implicit-store.ll | 2 +- ...ine-region-phi-references-in-scop-value.ll | 3 +- .../non-affine-subregion-dominance-reuse.ll | 3 +- polly/test/CodeGen/non-affine-switch.ll | 3 +- .../non-affine-synthesized-in-branch.ll | 2 +- polly/test/CodeGen/non-affine-update.ll | 3 +- .../non-hoisted-load-needed-as-base-ptr.ll | 2 +- .../test/CodeGen/non_affine_float_compare.ll | 4 +- .../CodeGen/only_non_affine_error_region.ll | 2 +- polly/test/CodeGen/openmp_limit_threads.ll | 12 +- .../test/CodeGen/out-of-scop-phi-node-use.ll | 2 +- polly/test/CodeGen/param_div_div_div_2.ll | 4 +- polly/test/CodeGen/partial_write_array.ll | 2 +- polly/test/CodeGen/partial_write_emptyset.ll | 2 +- ...l_write_full_write_that_appears_partial.ll | 2 +- .../partial_write_impossible_restriction.ll | 2 +- polly/test/CodeGen/partial_write_in_region.ll | 5 +- .../partial_write_in_region_with_loop.ll | 5 +- .../CodeGen/partial_write_mapped_scalar.ll | 2 +- .../partial_write_mapped_scalar_subregion.ll | 2 +- polly/test/CodeGen/perf_monitoring.ll | 3 +- .../perf_monitoring_cycles_per_scop.ll | 3 +- .../perf_monitoring_trip_counts_per_scop.ll | 3 +- polly/test/CodeGen/phi-defined-before-scop.ll | 2 +- .../phi_after_error_block_outside_of_scop.ll | 2 +- .../test/CodeGen/phi_condition_modeling_1.ll | 2 +- .../test/CodeGen/phi_condition_modeling_2.ll | 2 +- .../test/CodeGen/phi_conditional_simple_1.ll | 4 +- .../phi_in_exit_early_lnt_failure_1.ll | 2 +- .../phi_in_exit_early_lnt_failure_2.ll | 2 +- .../phi_in_exit_early_lnt_failure_3.ll | 2 +- .../phi_in_exit_early_lnt_failure_5.ll | 2 +- polly/test/CodeGen/phi_loop_carried_float.ll | 2 +- .../CodeGen/phi_loop_carried_float_escape.ll | 6 +- polly/test/CodeGen/phi_scalar_simple_1.ll | 2 +- polly/test/CodeGen/phi_scalar_simple_2.ll | 2 +- .../CodeGen/phi_with_multi_exiting_edges_2.ll | 2 +- polly/test/CodeGen/phi_with_one_exit_edge.ll | 2 +- .../CodeGen/pointer-type-expressions-2.ll | 4 +- .../test/CodeGen/pointer-type-expressions.ll | 4 +- .../pointer-type-pointer-type-comparison.ll | 4 +- polly/test/CodeGen/pointer_rem.ll | 4 +- polly/test/CodeGen/pr25241.ll | 2 +- polly/test/CodeGen/ptrtoint_as_parameter.ll | 2 +- polly/test/CodeGen/read-only-scalars.ll | 8 +- polly/test/CodeGen/reduction.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/reduction_simple_binary.ll | 2 +- polly/test/CodeGen/reggen_domtree_crash.ll | 2 +- .../test/CodeGen/region-with-instructions.ll | 2 +- polly/test/CodeGen/region_exiting-domtree.ll | 2 +- .../CodeGen/region_multiexit_partialwrite.ll | 2 +- ...run-time-condition-with-scev-parameters.ll | 4 +- polly/test/CodeGen/run-time-condition.ll | 2 +- .../scalar-references-used-in-scop-compute.ll | 2 +- .../test/CodeGen/scalar-store-from-same-bb.ll | 3 +- polly/test/CodeGen/scalar_codegen_crash.ll | 3 +- polly/test/CodeGen/scev-backedgetaken.ll | 2 +- .../CodeGen/scev-division-invariant-load.ll | 2 +- polly/test/CodeGen/scev.ll | 2 +- .../CodeGen/scev_expansion_in_nonaffine.ll | 3 +- .../CodeGen/scev_looking_through_bitcasts.ll | 2 +- .../CodeGen/scop_expander_insert_point.ll | 3 +- polly/test/CodeGen/scop_expander_segfault.ll | 2 +- ...p_never_executed_runtime_check_location.ll 
| 2 +- polly/test/CodeGen/select-base-pointer.ll | 2 +- polly/test/CodeGen/sequential_loops.ll | 2 +- .../CodeGen/simple_loop_non_single_exit.ll | 2 +- .../CodeGen/simple_loop_non_single_exit_2.ll | 2 +- polly/test/CodeGen/simple_non_single_entry.ll | 2 +- polly/test/CodeGen/simple_nonaffine_loop.ll | 2 +- .../single_do_loop_int_max_iterations.ll | 2 +- .../single_do_loop_int_param_iterations.ll | 2 +- .../single_do_loop_ll_max_iterations.ll | 4 +- .../CodeGen/single_do_loop_one_iteration.ll | 2 +- .../CodeGen/single_do_loop_scev_replace.ll | 2 +- polly/test/CodeGen/single_loop.ll | 2 +- .../CodeGen/single_loop_int_max_iterations.ll | 2 +- .../CodeGen/single_loop_ll_max_iterations.ll | 2 +- .../test/CodeGen/single_loop_one_iteration.ll | 2 +- polly/test/CodeGen/single_loop_param.ll | 2 +- .../CodeGen/single_loop_param_less_equal.ll | 6 +- .../CodeGen/single_loop_param_less_than.ll | 4 +- .../CodeGen/single_loop_zero_iterations.ll | 2 +- polly/test/CodeGen/split_edge_of_exit.ll | 4 +- polly/test/CodeGen/split_edges.ll | 2 +- polly/test/CodeGen/split_edges_2.ll | 2 +- polly/test/CodeGen/srem-in-other-bb.ll | 3 +- .../stack-overflow-in-load-hoisting.ll | 3 +- .../test/CodeGen/stmt_split_no_dependence.ll | 2 +- .../CodeGen/switch-in-non-affine-region.ll | 3 +- .../synthesizable_phi_write_after_loop.ll | 2 +- .../test-invalid-operands-for-select-2.ll | 2 +- .../test-invalid-operands-for-select.ll | 2 +- polly/test/CodeGen/test.ll | 2 +- .../two-loops-right-after-each-other-2.ll | 2 +- .../two-scops-in-row-invalidate-scevs.ll | 2 +- polly/test/CodeGen/two-scops-in-row.ll | 4 +- polly/test/CodeGen/udiv_expansion_position.ll | 2 +- .../CodeGen/uninitialized_scalar_memory.ll | 2 +- .../unpredictable-loop-unsynthesizable.ll | 6 +- .../test/CodeGen/variant_load_empty_domain.ll | 2 +- .../whole-scop-non-affine-subregion.ll | 3 +- polly/test/DeLICM/confused_order.ll | 4 +- ...ontradicting_assumed_context_and_domain.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/map_memset_zero.ll | 4 +- polly/test/DeLICM/nomap_alreadymapped.ll | 2 +- polly/test/DeLICM/nomap_escaping.ll | 2 +- polly/test/DeLICM/nomap_occupied.ll | 2 +- polly/test/DeLICM/nomap_readonly.ll | 2 +- polly/test/DeLICM/nomap_spuriouswrite.ll | 2 +- polly/test/DeLICM/nomap_storagesize.ll | 2 +- polly/test/DeLICM/nomap_writewrite.ll | 2 +- polly/test/DeLICM/outofquota-reverseDomain.ll | 2 +- polly/test/DeLICM/pass_existence.ll | 6 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reduction.ll | 2 +- .../DeLICM/reduction_constant_selfconflict.ll | 2 +- polly/test/DeLICM/reduction_looprotate.ll | 2 +- .../reduction_looprotate_alwaystaken.ll | 2 +- .../DeLICM/reduction_looprotate_gvnpre.ll | 4 +- .../reduction_looprotate_gvnpre_cond1.ll | 2 +- .../reduction_looprotate_gvnpre_cond2.ll | 2 +- ...reduction_looprotate_gvnpre_nopreheader.ll | 2 +- .../DeLICM/reduction_looprotate_hoisted.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm2.ll | 2 +- .../reduction_looprotate_licm_double_write.ll | 5 +- .../reduction_looprotate_licm_nopreheader.ll | 2 +- .../test/DeLICM/reduction_looprotate_load.ll | 2 +- .../reduction_looprotate_loopguard_gvnpre.ll | 2 +- .../reduction_looprotate_loopguard_licm1.ll | 2 +- .../reduction_looprotate_loopguard_licm2.ll | 2 +- .../reduction_looprotate_loopguard_licm3.ll | 2 +- .../DeLICM/reduction_looprotate_readonly.ll | 2 +- .../reduction_looprotate_synthesizable.ll | 2 +- 
.../test/DeLICM/reduction_looprotate_undef.ll | 2 +- .../test/DeLICM/reduction_overapproximate.ll | 6 +- polly/test/DeLICM/reduction_preheader.ll | 2 +- .../test/DeLICM/reduction_unrelatedunusual.ll | 2 +- polly/test/DeLICM/reject_loadafterstore.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 4 +- polly/test/DeLICM/reject_storeafterstore.ll | 2 +- polly/test/DeLICM/reject_storeinsubregion.ll | 2 +- polly/test/DeLICM/reject_unusualstore.ll | 4 +- polly/test/DeLICM/skip_maywrite.ll | 2 +- polly/test/DeLICM/skip_multiaccess.ll | 2 +- polly/test/DeLICM/skip_notinloop.ll | 2 +- polly/test/DeLICM/skip_scalaraccess.ll | 2 +- .../DeadCodeElimination/chained_iterations.ll | 4 +- .../chained_iterations_2.ll | 4 +- polly/test/DeadCodeElimination/computeout.ll | 4 +- .../dead_iteration_elimination.ll | 2 +- .../non-affine-affine-mix.ll | 2 +- polly/test/DeadCodeElimination/non-affine.ll | 2 +- .../test/DeadCodeElimination/null_schedule.ll | 2 +- polly/test/DependenceInfo/computeout.ll | 4 +- .../different_schedule_dimensions.ll | 3 +- polly/test/DependenceInfo/do_pluto_matmult.ll | 4 +- polly/test/DependenceInfo/fine_grain_dep_0.ll | 4 +- .../generate_may_write_dependence_info.ll | 2 +- .../test/DependenceInfo/infeasible_context.ll | 6 +- ...writes_do_not_block_must_writes_for_war.ll | 2 +- .../nonaffine-condition-buildMemoryAccess.ll | 2 +- .../reduction_complex_location.ll | 6 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_dependences_not_null.ll | 2 +- .../reduction_indirect_access.ll | 2 +- ...reduction_and_non_reduction_dependences.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 6 +- .../reduction_multiple_loops_array_sum_2.ll | 2 +- .../reduction_multiple_loops_array_sum_3.ll | 2 +- .../reduction_multiple_reductions.ll | 2 +- .../reduction_multiple_reductions_2.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- ...lly_escaping_intermediate_in_other_stmt.ll | 2 +- .../reduction_privatization_deps.ll | 2 +- .../reduction_privatization_deps_2.ll | 2 +- .../reduction_privatization_deps_3.ll | 2 +- .../reduction_privatization_deps_4.ll | 2 +- .../reduction_privatization_deps_5.ll | 2 +- .../test/DependenceInfo/reduction_sequence.ll | 2 +- .../DependenceInfo/reduction_simple_iv.ll | 2 +- ...ion_simple_iv_debug_wrapped_dependences.ll | 2 +- .../reduction_simple_privatization_deps_2.ll | 2 +- ...n_simple_privatization_deps_w_parameter.ll | 2 +- ...duction_two_reductions_different_rloops.ll | 2 +- polly/test/DependenceInfo/sequential_loops.ll | 6 +- polly/test/FlattenSchedule/gemm.ll | 2 +- polly/test/ForwardOpTree/atax.ll | 2 +- polly/test/ForwardOpTree/changed-kind.ll | 2 +- .../test/ForwardOpTree/forward_from_region.ll | 2 +- polly/test/ForwardOpTree/forward_hoisted.ll | 2 +- .../test/ForwardOpTree/forward_instruction.ll | 2 +- .../test/ForwardOpTree/forward_into_region.ll | 2 +- .../forward_into_region_redundant_use.ll | 2 +- polly/test/ForwardOpTree/forward_load.ll | 2 +- .../forward_load_differentarray.ll | 2 +- .../forward_load_double_write.ll | 2 +- .../ForwardOpTree/forward_load_fromloop.ll | 2 +- .../ForwardOpTree/forward_load_indirect.ll | 2 +- .../forward_load_memset_after.ll | 2 +- .../forward_load_memset_before.ll | 2 +- .../ForwardOpTree/forward_load_tripleuse.ll | 2 +- .../forward_load_unrelatedunusual.ll | 2 +- polly/test/ForwardOpTree/forward_phi_load.ll | 2 +- polly/test/ForwardOpTree/forward_readonly.ll | 4 +- polly/test/ForwardOpTree/forward_reusue.ll | 2 +- polly/test/ForwardOpTree/forward_store.ll | 2 +- 
.../forward_synthesizable_definloop.ll | 2 +- .../forward_synthesizable_indvar.ll | 2 +- .../forward_synthesizable_useinloop.ll | 2 +- .../test/ForwardOpTree/forward_transitive.ll | 2 +- polly/test/ForwardOpTree/jacobi-1d.ll | 2 +- .../ForwardOpTree/noforward_from_region.ll | 2 +- .../noforward_load_conditional.ll | 2 +- .../noforward_load_writebetween.ll | 2 +- .../ForwardOpTree/noforward_outofquota.ll | 4 +- polly/test/ForwardOpTree/noforward_partial.ll | 2 +- polly/test/ForwardOpTree/noforward_phi.ll | 2 +- .../ForwardOpTree/noforward_selfrefphi.ll | 2 +- .../ForwardOpTree/noforward_sideffects.ll | 2 +- .../noforward_synthesizable_unknownit.ll | 2 +- polly/test/ForwardOpTree/out-of-quota1.ll | 2 +- .../OpenMP/multiple_loops_outer_parallel.ll | 2 +- .../OpenMP/nested_loop_both_parallel.ll | 2 +- .../nested_loop_both_parallel_parametric.ll | 2 +- .../OpenMP/nested_loop_inner_parallel.ll | 2 +- .../OpenMP/nested_loop_outer_parallel.ll | 2 +- .../OpenMP/single_loop_param_non_parallel.ll | 2 +- .../OpenMP/single_loop_param_parallel.ll | 2 +- .../single_loop_param_parallel_computeout.ll | 2 +- .../alias_checks_with_empty_context.ll | 3 +- polly/test/IstAstInfo/alias_simple_1.ll | 10 +- polly/test/IstAstInfo/alias_simple_2.ll | 12 +- polly/test/IstAstInfo/alias_simple_3.ll | 10 +- .../aliasing_arrays_with_identical_base.ll | 4 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- .../dependence_distance_constant.ll | 2 +- .../IstAstInfo/dependence_distance_minimal.ll | 2 +- .../dependence_distance_multiple_constant.ll | 2 +- .../dependence_distance_parametric.ll | 2 +- .../dependence_distance_parametric_expr.ll | 2 +- .../IstAstInfo/dependence_distance_varying.ll | 2 +- ...pendence_distance_varying_in_outer_loop.ll | 2 +- .../dependence_distance_varying_multiple.ll | 2 +- .../domain_bounded_only_with_context.ll | 2 +- polly/test/IstAstInfo/non_affine_access.ll | 2 +- ...duction_clauses_multidimensional_access.ll | 2 +- ...reduction_clauses_onedimensional_access.ll | 2 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_different_reduction_clauses.ll | 2 +- .../IstAstInfo/reduction_in_one_dimension.ll | 2 +- .../IstAstInfo/reduction_loop_reversal.ll | 2 +- ...ction_modulo_and_loop_reversal_schedule.ll | 2 +- ...ion_modulo_and_loop_reversal_schedule_2.ll | 2 +- .../IstAstInfo/reduction_modulo_schedule.ll | 2 +- ...ion_modulo_schedule_multiple_dimensions.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_2.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_3.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_4.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_5.ll | 2 +- .../reduction_multiple_dimensions.ll | 2 +- .../reduction_multiple_dimensions_2.ll | 2 +- .../reduction_multiple_dimensions_3.ll | 2 +- .../reduction_multiple_dimensions_4.ll | 2 +- polly/test/IstAstInfo/run-time-condition.ll | 2 +- .../runtime_context_with_error_blocks.ll | 2 +- .../IstAstInfo/simple-run-time-condition.ll | 2 +- .../test/IstAstInfo/single_loop_strip_mine.ll | 4 +- .../single_loop_uint_max_iterations.ll | 2 +- .../single_loop_ull_max_iterations.ll | 2 +- .../ImportAccesses-Bad-relation.ll | 2 +- .../ImportAccesses-No-accesses-key.ll | 2 +- .../ImportAccesses-Not-enough-MemAcc.ll | 2 +- .../ImportAccesses-Not-enough-statements.ll | 2 +- .../ImportAccesses-Relation-mispelled.ll | 2 +- .../ImportAccesses-Statements-mispelled.ll | 2 +- ...ImportAccesses-Undeclared-ScopArrayInfo.ll | 2 +- 
.../ImportAccesses-Wrong-number-dimensions.ll | 2 +- .../ImportArrays-Mispelled-type.ll | 2 +- .../ImportArrays-Negative-size.ll | 2 +- .../ImportArrays/ImportArrays-No-name.ll | 2 +- .../ImportArrays/ImportArrays-No-sizes-key.ll | 2 +- .../ImportArrays/ImportArrays-No-type-key.ll | 2 +- .../ImportContext-Context-mispelled.ll | 2 +- .../ImportContext-Not-parameter-set.ll | 2 +- .../ImportContext-Unvalid-Context.ll | 2 +- .../ImportContext-Wrong-dimension.ll | 2 +- .../ImportSchedule-No-schedule-key.ll | 2 +- .../ImportSchedule-Schedule-not-valid.ll | 2 +- .../ImportSchedule-Statements-mispelled.ll | 2 +- .../ImportSchedule-Wrong-number-statements.ll | 2 +- .../load_after_store_same_statement.ll | 4 +- .../read_from_original.ll | 4 +- .../MaximalStaticExpansion/too_many_writes.ll | 4 +- .../working_deps_between_inners.ll | 2 +- .../working_deps_between_inners_phi.ll | 4 +- .../working_expansion.ll | 2 +- ...sion_multiple_dependences_per_statement.ll | 2 +- ...sion_multiple_instruction_per_statement.ll | 2 +- .../working_phi_expansion.ll | 4 +- .../working_phi_two_scalars.ll | 4 +- .../working_value_expansion.ll | 2 +- .../prune_only_scalardeps.ll | 2 +- .../2012-03-16-Empty-Domain.ll | 2 +- .../2013-04-11-Empty-Domain-two.ll | 2 +- .../GreedyFuse/fuse-double.ll | 4 +- .../GreedyFuse/fuse-except-first.ll | 4 +- .../GreedyFuse/fuse-except-third.ll | 4 +- .../GreedyFuse/fuse-inner-carried.ll | 4 +- .../GreedyFuse/fuse-inner-third.ll | 4 +- .../GreedyFuse/fuse-inner.ll | 4 +- .../GreedyFuse/fuse-simple.ll | 4 +- .../GreedyFuse/nofuse-simple.ll | 4 +- .../GreedyFuse/nofuse-with-middle.ll | 4 +- .../ManualOptimization/disable_nonforced.ll | 2 +- .../distribute_heuristic.ll | 4 +- .../distribute_illegal_looploc.ll | 2 +- .../distribute_illegal_pragmaloc.ll | 2 +- .../ManualOptimization/unroll_disable.ll | 2 +- .../ManualOptimization/unroll_double.ll | 2 +- .../ManualOptimization/unroll_full.ll | 2 +- .../ManualOptimization/unroll_heuristic.ll | 4 +- .../ManualOptimization/unroll_partial.ll | 4 +- .../unroll_partial_followup.ll | 8 +- .../ScheduleOptimizer/SIMDInParallelFor.ll | 2 +- polly/test/ScheduleOptimizer/computeout.ll | 4 +- .../ensure-correct-tile-sizes.ll | 7 +- .../focaltech_test_detail_threshold-7bc17e.ll | 2 +- .../full_partial_tile_separation.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling-2.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling.ll | 2 +- .../mat_mul_pattern_data_layout.ll | 11 +- .../mat_mul_pattern_data_layout_2.ll | 10 +- .../ScheduleOptimizer/one-dimensional-band.ll | 2 +- .../ScheduleOptimizer/outer_coincidence.ll | 4 +- ...attern-matching-based-opts-after-delicm.ll | 6 +- ...tern-matching-based-opts-after-delicm_2.ll | 4 +- .../pattern-matching-based-opts.ll | 9 +- .../pattern-matching-based-opts_11.ll | 14 +- .../pattern-matching-based-opts_12.ll | 10 +- .../pattern-matching-based-opts_13.ll | 10 +- .../pattern-matching-based-opts_14.ll | 11 +- .../pattern-matching-based-opts_15.ll | 4 +- .../pattern-matching-based-opts_16.ll | 3 +- .../pattern-matching-based-opts_17.ll | 3 +- .../pattern-matching-based-opts_18.ll | 3 +- .../pattern-matching-based-opts_19.ll | 3 +- .../pattern-matching-based-opts_2.ll | 3 +- .../pattern-matching-based-opts_20.ll | 3 +- .../pattern-matching-based-opts_21.ll | 3 +- .../pattern-matching-based-opts_22.ll | 3 +- .../pattern-matching-based-opts_24.ll | 4 +- .../pattern-matching-based-opts_25.ll | 3 +- .../pattern-matching-based-opts_3.ll | 17 +- .../pattern-matching-based-opts_4.ll | 12 +- .../pattern-matching-based-opts_5.ll | 10 +- 
.../pattern-matching-based-opts_6.ll | 10 +- .../pattern-matching-based-opts_7.ll | 10 +- .../pattern-matching-based-opts_8.ll | 10 +- .../pattern-matching-based-opts_9.ll | 12 +- .../pattern_matching_based_opts_splitmap.ll | 2 +- .../prevectorization-without-tiling.ll | 2 +- .../ScheduleOptimizer/prevectorization.ll | 4 +- .../prevectorization_islbound.ll | 2 +- .../ScheduleOptimizer/rectangular-tiling.ll | 8 +- .../ScheduleOptimizer/schedule_computeout.ll | 2 +- polly/test/ScheduleOptimizer/statistics.ll | 2 +- .../ScheduleOptimizer/tile_after_fusion.ll | 2 +- ...vivid_vbi_gen_sliced-before-llvmreduced.ll | 2 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- polly/test/ScopDetect/aliasing_simple_1.ll | 2 +- polly/test/ScopDetect/aliasing_simple_2.ll | 2 +- polly/test/ScopDetect/base_pointer.ll | 2 +- .../base_pointer_load_setNewAccessRelation.ll | 2 +- .../base_pointer_setNewAccessRelation.ll | 4 +- polly/test/ScopDetect/callbr.ll | 4 +- .../ScopDetect/collective_invariant_loads.ll | 2 +- .../ScopDetect/cross_loop_non_single_exit.ll | 2 +- .../cross_loop_non_single_exit_2.ll | 2 +- ...ependency_to_phi_node_outside_of_region.ll | 2 +- .../test/ScopDetect/detect-full-functions.ll | 2 +- polly/test/ScopDetect/dom-tree-crash.ll | 2 +- polly/test/ScopDetect/dot-scops-npm.ll | 2 +- polly/test/ScopDetect/dot-scops.ll | 2 +- .../ScopDetect/error-block-always-executed.ll | 2 +- .../error-block-referenced-from-scop.ll | 2 +- .../ScopDetect/error-block-unreachable.ll | 2 +- .../ScopDetect/expand-region-correctly-2.ll | 2 +- .../ScopDetect/expand-region-correctly.ll | 2 +- .../test/ScopDetect/ignore_func_flag_regex.ll | 2 +- .../index_from_unpredictable_loop.ll | 4 +- .../index_from_unpredictable_loop2.ll | 4 +- polly/test/ScopDetect/indvars.ll | 2 +- polly/test/ScopDetect/intrinsics_1.ll | 2 +- polly/test/ScopDetect/intrinsics_2.ll | 2 +- polly/test/ScopDetect/intrinsics_3.ll | 2 +- .../ScopDetect/invalid-latch-conditions.ll | 6 +- .../ScopDetect/invalidate_scalar_evolution.ll | 2 +- .../ScopDetect/invariant-load-before-scop.ll | 2 +- polly/test/ScopDetect/keep_going_expansion.ll | 2 +- polly/test/ScopDetect/mod_ref_read_pointer.ll | 4 +- polly/test/ScopDetect/more-than-one-loop.ll | 4 +- .../ScopDetect/multidim-with-undef-size.ll | 2 +- polly/test/ScopDetect/multidim.ll | 2 +- .../ScopDetect/multidim_indirect_access.ll | 2 +- ..._two_accesses_different_delinearization.ll | 2 +- .../ScopDetect/nested_loop_single_exit.ll | 4 +- .../test/ScopDetect/non-affine-conditional.ll | 2 +- .../ScopDetect/non-affine-float-compare.ll | 2 +- ...-affine-loop-condition-dependent-access.ll | 8 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- polly/test/ScopDetect/non-affine-loop.ll | 10 +- .../non-beneficial-loops-small-trip-count.ll | 2 +- .../non-constant-add-rec-start-expr.ll | 2 +- .../ScopDetect/non-simple-memory-accesses.ll | 2 +- .../ScopDetect/non_affine_loop_condition.ll | 4 +- polly/test/ScopDetect/only-one-affine-loop.ll | 2 +- polly/test/ScopDetect/only_func_flag.ll | 2 +- polly/test/ScopDetect/only_func_flag_regex.ll | 2 +- .../parametric-multiply-in-scev-2.ll | 2 +- .../ScopDetect/parametric-multiply-in-scev.ll | 2 +- .../phi_with_multi_exiting_edges.ll | 2 +- .../profitability-large-basic-blocks.ll | 10 +- .../profitability-two-nested-loops.ll | 2 +- polly/test/ScopDetect/remove_all_children.ll | 2 +- polly/test/ScopDetect/report-scop-location.ll | 2 +- .../restrict-undef-size-scopdetect.ll | 2 +- 
polly/test/ScopDetect/run_time_alias_check.ll | 2 +- polly/test/ScopDetect/scev_remove_max.ll | 2 +- polly/test/ScopDetect/sequential_loops.ll | 2 +- polly/test/ScopDetect/simple_loop.ll | 2 +- .../simple_loop_non_single_entry.ll | 2 +- .../ScopDetect/simple_loop_non_single_exit.ll | 2 +- .../simple_loop_non_single_exit_2.ll | 2 +- .../ScopDetect/simple_loop_two_phi_nodes.ll | 2 +- .../test/ScopDetect/simple_loop_with_param.ll | 2 +- .../ScopDetect/simple_loop_with_param_2.ll | 2 +- .../ScopDetect/simple_non_single_entry.ll | 2 +- .../ScopDetect/skip_function_attribute.ll | 2 +- .../srem_with_parametric_divisor.ll | 2 +- polly/test/ScopDetect/statistics.ll | 2 +- polly/test/ScopDetect/switch-in-loop-patch.ll | 2 +- .../test/ScopDetect/tlr_is_hoistable_load.ll | 2 +- .../ReportAlias-01.ll | 2 +- .../ScopDetectionDiagnostics/ReportEntry.ll | 2 +- .../ReportFuncCall-01.ll | 2 +- .../ReportIrreducibleRegion.ll | 2 +- .../ReportIrreducibleRegionWithoutDebugLoc.ll | 2 +- .../ReportLoopBound-01.ll | 16 +- .../ReportLoopHasNoExit.ll | 4 +- .../ReportMultipleNonAffineAccesses.ll | 12 +- .../ReportNonAffineAccess-01.ll | 2 +- .../ReportUnprofitable.ll | 8 +- .../ReportUnreachableInExit.ll | 3 +- .../ReportVariantBasePtr-01.ll | 2 +- .../loop_has_multiple_exits.ll | 2 +- .../loop_partially_in_scop-2.ll | 2 +- .../loop_partially_in_scop.ll | 2 +- .../ScopInfo/20110312-Fail-without-basicaa.ll | 2 +- .../20111108-Parameter-not-detected.ll | 2 +- ...03-16-Crash-because-of-unsigned-in-scev.ll | 2 +- .../2015-10-04-Crash-in-domain-generation.ll | 2 +- polly/test/ScopInfo/Alias-0.ll | 4 +- polly/test/ScopInfo/Alias-1.ll | 4 +- polly/test/ScopInfo/Alias-2.ll | 4 +- polly/test/ScopInfo/Alias-3.ll | 4 +- polly/test/ScopInfo/Alias-4.ll | 4 +- .../test/ScopInfo/BoundChecks/single-loop.ll | 4 +- polly/test/ScopInfo/BoundChecks/two-loops.ll | 4 +- polly/test/ScopInfo/NonAffine/div_backedge.ll | 2 +- polly/test/ScopInfo/NonAffine/div_domain.ll | 2 +- ...nt_loads_dependent_in_non_affine_region.ll | 2 +- .../ScopInfo/NonAffine/modulo_backedge.ll | 2 +- .../test/ScopInfo/NonAffine/modulo_domain.ll | 2 +- ...ffine-loop-condition-dependent-access_1.ll | 4 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- .../non_affine_access_with_range_2.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_sdiv.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_srem.ll | 2 +- .../non_affine_conditional_nested.ll | 2 +- ...ine_conditional_surrounding_affine_loop.ll | 11 +- ...conditional_surrounding_non_affine_loop.ll | 16 +- .../NonAffine/non_affine_float_compare.ll | 2 +- .../NonAffine/non_affine_loop_condition.ll | 6 +- .../NonAffine/non_affine_loop_used_later.ll | 4 +- .../NonAffine/non_affine_parametric_loop.ll | 2 +- .../non_affine_region_guaranteed_non-entry.ll | 2 +- ...whole-scop-non-affine-subregion-in-loop.ll | 2 +- .../aliasing_conditional_alias_groups_1.ll | 2 +- .../aliasing_conditional_alias_groups_2.ll | 2 +- polly/test/ScopInfo/aliasing_dead_access.ll | 2 +- .../aliasing_many_arrays_to_compare.ll | 7 +- ...iasing_many_parameters_not_all_involved.ll | 4 +- .../aliasing_many_read_only_acesses.ll | 2 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_with_non_affine_access.ll | 2 +- .../allow-all-parameters-dereferencable.ll | 11 +- polly/test/ScopInfo/assume_gep_bounds.ll | 4 +- polly/test/ScopInfo/assume_gep_bounds_2.ll | 3 +- polly/test/ScopInfo/assume_gep_bounds_many.ll | 3 +- .../avoid_new_parameters_from_geps.ll | 2 +- polly/test/ScopInfo/bool-addrec.ll | 2 
+- .../test/ScopInfo/bounded_loop_assumptions.ll | 2 +- ...ces-loop-scev-with-unknown-iterations-2.ll | 6 +- ...ces-loop-scev-with-unknown-iterations-3.ll | 7 +- ...ences-loop-scev-with-unknown-iterations.ll | 7 +- polly/test/ScopInfo/bug_2010_10_22.ll | 2 +- polly/test/ScopInfo/bug_2011_1_5.ll | 2 +- .../test/ScopInfo/bug_scev_not_fully_eval.ll | 2 +- polly/test/ScopInfo/cfg_consequences.ll | 2 +- .../test/ScopInfo/complex-branch-structure.ll | 3 +- polly/test/ScopInfo/complex-condition.ll | 4 +- polly/test/ScopInfo/complex-expression.ll | 4 +- polly/test/ScopInfo/complex-loop-nesting.ll | 2 +- .../ScopInfo/complex-successor-structure-2.ll | 4 +- .../ScopInfo/complex-successor-structure-3.ll | 3 +- .../ScopInfo/complex-successor-structure.ll | 4 +- .../complex_domain_binary_condition.ll | 3 +- .../ScopInfo/complex_execution_context.ll | 4 +- polly/test/ScopInfo/cond_constant_in_loop.ll | 2 +- polly/test/ScopInfo/cond_in_loop.ll | 2 +- .../ScopInfo/condition-after-error-block-2.ll | 2 +- ...condition-after-error-block-before-scop.ll | 2 +- .../ScopInfo/condtion-after-error-block.ll | 2 +- polly/test/ScopInfo/const_srem_sdiv.ll | 3 +- .../constant-non-integer-branch-condition.ll | 2 +- .../ScopInfo/constant_factor_in_parameter.ll | 4 +- ...stant_functions_outside_scop_as_unknown.ll | 2 +- polly/test/ScopInfo/constant_start_integer.ll | 2 +- polly/test/ScopInfo/debug_call.ll | 2 +- .../delinearize-together-all-data-refs.ll | 2 +- polly/test/ScopInfo/div_by_zero.ll | 2 +- .../do-not-model-error-block-accesses.ll | 2 +- .../eager-binary-and-or-conditions.ll | 4 +- .../early_exit_for_complex_domains.ll | 2 +- polly/test/ScopInfo/error-blocks-1.ll | 2 +- polly/test/ScopInfo/error-blocks-2.ll | 3 +- polly/test/ScopInfo/error-blocks-3.ll | 2 +- polly/test/ScopInfo/escaping_empty_scop.ll | 2 +- polly/test/ScopInfo/exit-phi-1.ll | 4 +- polly/test/ScopInfo/exit-phi-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses.ll | 2 +- .../ScopInfo/expensive-boundary-context.ll | 3 +- ...onstant_factor_introduces_new_parameter.ll | 4 +- polly/test/ScopInfo/full-function.ll | 6 +- polly/test/ScopInfo/granularity_same_name.ll | 8 +- .../test/ScopInfo/granularity_scalar-indep.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi1.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi2.ll | 2 +- .../granularity_scalar-indep_epilogue.ll | 2 +- .../granularity_scalar-indep_epilogue_last.ll | 2 +- .../granularity_scalar-indep_noepilogue.ll | 2 +- .../granularity_scalar-indep_ordered-2.ll | 2 +- .../granularity_scalar-indep_ordered.ll | 2 +- polly/test/ScopInfo/i1_params.ll | 2 +- polly/test/ScopInfo/infeasible-rtc.ll | 6 +- .../ScopInfo/infeasible_invalid_context.ll | 6 +- polly/test/ScopInfo/int2ptr_ptr2int.ll | 4 +- polly/test/ScopInfo/int2ptr_ptr2int_2.ll | 6 +- polly/test/ScopInfo/integers.ll | 2 +- .../ScopInfo/inter-error-bb-dependence.ll | 2 +- polly/test/ScopInfo/inter_bb_scalar_dep.ll | 3 +- .../intra-non-affine-stmt-phi-node.ll | 3 +- .../ScopInfo/intra_and_inter_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intra_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intrinsics.ll | 2 +- ..._add_rec_after_invariant_load_remapping.ll | 2 +- .../invalidate_iterator_during_MA_removal.ll | 2 +- .../test/ScopInfo/invariant-load-instlist.ll | 2 +- ...ariant-loads-leave-read-only-statements.ll | 4 +- polly/test/ScopInfo/invariant_load.ll | 2 +- ...load_access_classes_different_base_type.ll | 4 +- ...ss_classes_different_base_type_escaping.ll | 4 +- 
...lasses_different_base_type_same_pointer.ll | 4 +- ...fferent_base_type_same_pointer_escaping.ll | 4 +- .../ScopInfo/invariant_load_addrec_sum.ll | 2 +- .../ScopInfo/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...ariant_load_base_pointer_in_conditional.ll | 2 +- .../invariant_load_branch_condition.ll | 3 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_2.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_3.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_4.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4b.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4c.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_5.ll | 4 +- .../invariant_load_complex_condition.ll | 3 +- .../test/ScopInfo/invariant_load_condition.ll | 2 +- .../invariant_load_dereferenceable.ll | 4 +- ...iant_load_distinct_parameter_valuations.ll | 2 +- .../ScopInfo/invariant_load_in_non_affine.ll | 3 +- polly/test/ScopInfo/invariant_load_loop_ub.ll | 4 +- .../invariant_load_ptr_ptr_noalias.ll | 3 +- .../ScopInfo/invariant_load_scalar_dep.ll | 2 +- .../ScopInfo/invariant_load_stmt_domain.ll | 2 +- .../invariant_load_zext_parameter-2.ll | 4 +- .../ScopInfo/invariant_load_zext_parameter.ll | 4 +- ...load_zextended_in_own_execution_context.ll | 4 +- ...invariant_loads_complicated_dependences.ll | 2 +- .../invariant_loads_cyclic_dependences.ll | 2 +- polly/test/ScopInfo/invariant_loop_bounds.ll | 2 +- ...ariant_same_loop_bound_multiple_times-1.ll | 2 +- ...ariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/isl_aff_out_of_bounds.ll | 2 +- polly/test/ScopInfo/isl_trip_count_01.ll | 2 +- polly/test/ScopInfo/isl_trip_count_02.ll | 2 +- polly/test/ScopInfo/isl_trip_count_03.ll | 2 +- .../isl_trip_count_multiple_exiting_blocks.ll | 2 +- polly/test/ScopInfo/licm_load.ll | 31 +- polly/test/ScopInfo/licm_potential_store.ll | 79 +++- .../ScopInfo/licm_potential_store_mssa.ll | 50 --- polly/test/ScopInfo/licm_reduction_nested.ll | 4 +- .../long-compile-time-alias-analysis.ll | 2 +- .../long-sequence-of-error-blocks-2.ll | 2 +- .../ScopInfo/long-sequence-of-error-blocks.ll | 3 +- .../test/ScopInfo/loop-multiexit-succ-cond.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_0.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_1.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_2.ll | 4 +- polly/test/ScopInfo/loop_carry.ll | 2 +- .../test/ScopInfo/many-scalar-dependences.ll | 2 +- polly/test/ScopInfo/max-loop-depth.ll | 2 +- polly/test/ScopInfo/memcpy-raw-source.ll | 2 +- polly/test/ScopInfo/memcpy.ll | 4 +- polly/test/ScopInfo/memmove.ll | 4 +- polly/test/ScopInfo/memset.ll | 4 +- polly/test/ScopInfo/memset_null.ll | 4 +- .../ScopInfo/mismatching-array-dimensions.ll | 2 +- .../mod_ref_access_pointee_arguments.ll | 6 +- .../mod_ref_read_pointee_arguments.ll | 6 +- polly/test/ScopInfo/mod_ref_read_pointer.ll | 4 +- polly/test/ScopInfo/mod_ref_read_pointers.ll | 6 +- polly/test/ScopInfo/modulo_zext_1.ll | 2 +- polly/test/ScopInfo/modulo_zext_2.ll | 2 +- polly/test/ScopInfo/modulo_zext_3.ll | 2 +- polly/test/ScopInfo/multi-scop.ll | 2 +- .../ScopInfo/multidim_2d-diagonal-matrix.ll | 4 +- .../multidim_2d_outer_parametric_offset.ll | 2 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- .../ScopInfo/multidim_2d_with_modref_call.ll | 8 +- .../multidim_2d_with_modref_call_2.ll | 8 +- ..._3d_parametric_array_static_loop_bounds.ll | 2 +- ...idim_fixedsize_different_dimensionality.ll | 2 +- .../multidim_fixedsize_multi_offset.ll | 2 +- 
.../ScopInfo/multidim_fold_constant_dim.ll | 2 +- .../multidim_fold_constant_dim_zero.ll | 2 +- polly/test/ScopInfo/multidim_fortran_2d.ll | 3 +- .../ScopInfo/multidim_fortran_2d_params.ll | 4 +- .../multidim_fortran_2d_with_modref_call.ll | 8 +- polly/test/ScopInfo/multidim_fortran_srem.ll | 2 +- .../test/ScopInfo/multidim_gep_pointercast.ll | 2 +- .../ScopInfo/multidim_gep_pointercast2.ll | 2 +- .../ScopInfo/multidim_invalid_dimension.ll | 2 +- .../multidim_ivs_and_integer_offsets_3d.ll | 2 +- ...multidim_ivs_and_parameteric_offsets_3d.ll | 2 +- .../test/ScopInfo/multidim_many_references.ll | 4 +- .../ScopInfo/multidim_nested_start_integer.ll | 4 +- .../multidim_nested_start_share_parameter.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_2d.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_3d.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_cast.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_reverse.ll | 2 +- .../ScopInfo/multidim_param_in_subscript-2.ll | 2 +- .../ScopInfo/multidim_param_in_subscript.ll | 2 +- .../multidim_parameter_addrec_product.ll | 2 +- .../multidim_single_and_multidim_array.ll | 16 +- polly/test/ScopInfo/multidim_srem.ll | 2 +- polly/test/ScopInfo/multidim_with_bitcast.ll | 2 +- .../ScopInfo/multiple-binary-or-conditions.ll | 4 +- ...ss-offset-not-dividable-by-element-size.ll | 4 +- .../ScopInfo/multiple-types-non-affine-2.ll | 4 +- .../ScopInfo/multiple-types-non-affine.ll | 4 +- .../multiple-types-non-power-of-two-2.ll | 2 +- .../multiple-types-non-power-of-two.ll | 2 +- .../multiple-types-two-dimensional-2.ll | 4 +- .../multiple-types-two-dimensional.ll | 4 +- polly/test/ScopInfo/multiple-types.ll | 3 +- .../test/ScopInfo/multiple_exiting_blocks.ll | 2 +- .../multiple_exiting_blocks_two_loop.ll | 2 +- polly/test/ScopInfo/multiple_latch_blocks.ll | 2 +- polly/test/ScopInfo/nested-loops.ll | 2 +- .../no-scalar-deps-in-non-affine-subregion.ll | 2 +- polly/test/ScopInfo/non-affine-region-phi.ll | 4 +- .../ScopInfo/non-affine-region-with-loop-2.ll | 2 +- .../ScopInfo/non-affine-region-with-loop.ll | 4 +- polly/test/ScopInfo/non-precise-inv-load-1.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-2.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-3.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-4.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-5.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-6.ll | 2 +- polly/test/ScopInfo/non-pure-function-call.ll | 2 +- ...-pure-function-calls-causes-dead-blocks.ll | 2 +- .../test/ScopInfo/non-pure-function-calls.ll | 2 +- polly/test/ScopInfo/non_affine_access.ll | 4 +- polly/test/ScopInfo/non_affine_region_1.ll | 2 +- polly/test/ScopInfo/non_affine_region_2.ll | 2 +- polly/test/ScopInfo/non_affine_region_3.ll | 4 +- polly/test/ScopInfo/non_affine_region_4.ll | 2 +- .../ScopInfo/nonaffine-buildMemoryAccess.ll | 2 +- polly/test/ScopInfo/not-a-reduction.ll | 2 +- polly/test/ScopInfo/opaque-struct.ll | 2 +- ...gion-entry-phi-node-nonaffine-subregion.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- .../ScopInfo/parameter-constant-division.ll | 4 +- .../ScopInfo/parameter_in_dead_statement.ll | 6 +- polly/test/ScopInfo/parameter_product.ll | 2 +- .../parameter_with_constant_factor_in_add.ll | 2 +- .../ScopInfo/partially_invariant_load_1.ll | 4 +- .../ScopInfo/partially_invariant_load_2.ll | 2 +- .../test/ScopInfo/phi-in-non-affine-region.ll | 2 +- polly/test/ScopInfo/phi_after_error_block.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_1.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_2.ll | 2 +- 
.../test/ScopInfo/phi_conditional_simple_1.ll | 2 +- polly/test/ScopInfo/phi_loop_carried_float.ll | 2 +- polly/test/ScopInfo/phi_not_grouped_at_top.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_1.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_2.ll | 2 +- polly/test/ScopInfo/phi_with_invoke_edge.ll | 2 +- .../ScopInfo/pointer-comparison-no-nsw.ll | 2 +- polly/test/ScopInfo/pointer-comparison.ll | 2 +- .../test/ScopInfo/pointer-type-expressions.ll | 2 +- ...er-used-as-base-pointer-and-scalar-read.ll | 2 +- .../polly-timeout-parameter-bounds.ll | 2 +- polly/test/ScopInfo/pr38218.ll | 2 +- ...eserve-equiv-class-order-in-basic_block.ll | 2 +- .../test/ScopInfo/process_added_dimensions.ll | 2 +- .../test/ScopInfo/pwaff-complexity-bailout.ll | 2 +- polly/test/ScopInfo/ranged_parameter.ll | 2 +- polly/test/ScopInfo/ranged_parameter_2.ll | 3 +- polly/test/ScopInfo/ranged_parameter_wrap.ll | 2 +- .../test/ScopInfo/ranged_parameter_wrap_2.ll | 2 +- .../read-only-scalar-used-in-phi-2.ll | 2 +- .../ScopInfo/read-only-scalar-used-in-phi.ll | 2 +- polly/test/ScopInfo/read-only-scalars.ll | 4 +- polly/test/ScopInfo/read-only-statements.ll | 2 +- .../ScopInfo/reduction_alternating_base.ll | 2 +- ...uction_chain_partially_outside_the_scop.ll | 2 +- .../ScopInfo/reduction_different_index.ll | 2 +- .../ScopInfo/reduction_different_index1.ll | 2 +- .../reduction_disabled_multiplicative.ll | 2 +- polly/test/ScopInfo/reduction_double.ll | 2 +- .../reduction_escaping_intermediate.ll | 2 +- .../reduction_escaping_intermediate_2.ll | 2 +- .../reduction_escaping_intermediate_3.ll | 2 +- polly/test/ScopInfo/reduction_if.ll | 2 +- .../ScopInfo/reduction_indirect_access.ll | 2 +- .../ScopInfo/reduction_indirect_access_2.ll | 2 +- .../reduction_invalid_different_operators.ll | 2 +- .../reduction_invalid_overlapping_accesses.ll | 2 +- .../reduction_long_reduction_chain.ll | 2 +- ...duction_long_reduction_chain_double_use.ll | 2 +- .../reduction_multiple_different_operators.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 2 +- .../reduction_multiple_loops_array_sum_1.ll | 2 +- .../reduction_multiple_simple_binary.ll | 2 +- .../reduction_non_overlapping_chains.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- polly/test/ScopInfo/reduction_simple_fp.ll | 2 +- .../ScopInfo/reduction_simple_w_constant.ll | 2 +- polly/test/ScopInfo/reduction_simple_w_iv.ll | 2 +- .../ScopInfo/reduction_two_identical_reads.ll | 4 +- .../redundant_parameter_constraint.ll | 2 +- .../test/ScopInfo/region-with-instructions.ll | 2 +- polly/test/ScopInfo/remarks.ll | 3 +- .../required-invariant-loop-bounds.ll | 3 +- .../ScopInfo/restriction_in_dead_block.ll | 2 +- .../run-time-check-many-array-disjuncts.ll | 5 +- .../run-time-check-many-parameters.ll | 2 +- .../run-time-check-many-piecewise-aliasing.ll | 5 +- .../run-time-check-read-only-arrays.ll | 2 +- .../same-base-address-scalar-and-array.ll | 2 +- polly/test/ScopInfo/scalar.ll | 2 +- .../ScopInfo/scalar_dependence_cond_br.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 4 +- .../scev-div-with-evaluatable-divisor.ll | 2 +- polly/test/ScopInfo/scev-invalidated.ll | 2 +- .../schedule-const-post-dominator-walk-2.ll | 2 +- .../schedule-const-post-dominator-walk.ll | 2 +- .../schedule-constuction-endless-loop1.ll | 2 +- .../schedule-constuction-endless-loop2.ll | 2 +- ...tly-contructed-in-case-of-infinite-loop.ll | 2 +- .../scop-affine-parameter-ordering.ll | 2 +- polly/test/ScopInfo/sign_wrapped_set.ll | 2 +- polly/test/ScopInfo/simple_loop_1.ll | 2 +- 
polly/test/ScopInfo/simple_loop_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_3.ll | 2 +- .../ScopInfo/simple_nonaffine_loop_not.ll | 2 +- polly/test/ScopInfo/smax.ll | 2 +- polly/test/ScopInfo/statistics.ll | 2 +- .../stmt_split_exit_of_region_stmt.ll | 2 +- .../ScopInfo/stmt_split_no_after_split.ll | 2 +- .../test/ScopInfo/stmt_split_no_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_on_store.ll | 2 +- .../ScopInfo/stmt_split_on_synthesizable.ll | 2 +- .../stmt_split_phi_in_beginning_bb.ll | 2 +- polly/test/ScopInfo/stmt_split_phi_in_stmt.ll | 2 +- .../ScopInfo/stmt_split_scalar_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_within_loop.ll | 2 +- .../stmt_with_read_but_without_sideffect.ll | 2 +- polly/test/ScopInfo/switch-1.ll | 4 +- polly/test/ScopInfo/switch-2.ll | 4 +- polly/test/ScopInfo/switch-3.ll | 4 +- polly/test/ScopInfo/switch-4.ll | 4 +- polly/test/ScopInfo/switch-5.ll | 4 +- polly/test/ScopInfo/switch-6.ll | 4 +- polly/test/ScopInfo/switch-7.ll | 4 +- polly/test/ScopInfo/tempscop-printing.ll | 2 +- .../ScopInfo/test-wrapping-in-condition.ll | 4 +- polly/test/ScopInfo/truncate-1.ll | 2 +- polly/test/ScopInfo/truncate-2.ll | 2 +- polly/test/ScopInfo/truncate-3.ll | 3 +- polly/test/ScopInfo/two-loops-one-infinite.ll | 2 +- .../two-loops-right-after-each-other.ll | 2 +- polly/test/ScopInfo/undef_in_cond.ll | 2 +- polly/test/ScopInfo/unnamed_nonaffine.ll | 4 +- polly/test/ScopInfo/unnamed_stmts.ll | 2 +- .../ScopInfo/unpredictable_nonscop_loop.ll | 2 +- .../test/ScopInfo/unprofitable_scalar-accs.ll | 4 +- polly/test/ScopInfo/unsigned-condition.ll | 2 +- polly/test/ScopInfo/unsigned-division-1.ll | 2 +- polly/test/ScopInfo/unsigned-division-2.ll | 2 +- polly/test/ScopInfo/unsigned-division-3.ll | 2 +- polly/test/ScopInfo/unsigned-division-4.ll | 2 +- polly/test/ScopInfo/unsigned-division-5.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_uge.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ugt.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ule.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ult.ll | 2 +- polly/test/ScopInfo/user_context.ll | 8 +- ...ed_assumptions-in-bb-signed-conditional.ll | 4 +- .../user_provided_assumptions-in-bb-signed.ll | 2 +- ...ser_provided_assumptions-in-bb-unsigned.ll | 4 +- .../ScopInfo/user_provided_assumptions.ll | 4 +- .../ScopInfo/user_provided_assumptions_2.ll | 4 +- .../ScopInfo/user_provided_assumptions_3.ll | 4 +- ...ser_provided_non_dominating_assumptions.ll | 6 +- polly/test/ScopInfo/variant_base_pointer.ll | 4 +- .../ScopInfo/variant_load_empty_domain.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_0.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_1.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_2.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_3.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_4.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_5.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_6.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_7.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_1.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_2.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate_2.ll | 2 +- .../test/ScopInfo/zero_ext_space_mismatch.ll | 2 +- polly/test/ScopInliner/ignore-declares.ll | 2 +- polly/test/ScopInliner/invariant-load-func.ll | 2 +- polly/test/ScopInliner/simple-inline-loop.ll | 2 +- polly/test/Simplify/coalesce_3partials.ll | 2 +- 
.../Simplify/coalesce_disjointelements.ll | 2 +- polly/test/Simplify/coalesce_overlapping.ll | 2 +- polly/test/Simplify/coalesce_partial.ll | 2 +- polly/test/Simplify/dead_access_load.ll | 2 +- polly/test/Simplify/dead_access_phi.ll | 2 +- polly/test/Simplify/dead_access_value.ll | 2 +- polly/test/Simplify/dead_instruction.ll | 2 +- polly/test/Simplify/emptyaccessdomain.ll | 2 +- polly/test/Simplify/exit_phi_accesses-2.ll | 2 +- polly/test/Simplify/func-b320a7.ll | 2 +- polly/test/Simplify/gemm.ll | 2 +- .../Simplify/nocoalesce_differentvalues.ll | 2 +- .../Simplify/nocoalesce_elementmismatch.ll | 2 +- polly/test/Simplify/nocoalesce_readbetween.ll | 2 +- .../test/Simplify/nocoalesce_writebetween.ll | 2 +- polly/test/Simplify/notdead_region_exitphi.ll | 2 +- .../test/Simplify/notdead_region_innerphi.ll | 2 +- .../test/Simplify/notredundant_region_loop.ll | 2 +- .../Simplify/notredundant_region_middle.ll | 2 +- .../notredundant_synthesizable_unknownit.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- polly/test/Simplify/overwritten.ll | 2 +- polly/test/Simplify/overwritten_3phi.ll | 2 +- polly/test/Simplify/overwritten_3store.ll | 2 +- .../overwritten_implicit_and_explicit.ll | 2 +- .../test/Simplify/overwritten_loadbetween.ll | 2 +- polly/test/Simplify/overwritten_scalar.ll | 2 +- polly/test/Simplify/pass_existence.ll | 2 +- polly/test/Simplify/phi_in_regionstmt.ll | 2 +- polly/test/Simplify/pr33323.ll | 2 +- polly/test/Simplify/redundant.ll | 2 +- .../test/Simplify/redundant_differentindex.ll | 2 +- polly/test/Simplify/redundant_partialwrite.ll | 2 +- polly/test/Simplify/redundant_region.ll | 2 +- .../test/Simplify/redundant_region_scalar.ll | 2 +- polly/test/Simplify/redundant_scalarwrite.ll | 2 +- polly/test/Simplify/redundant_storebetween.ll | 2 +- polly/test/Simplify/scalability1.ll | 2 +- polly/test/Simplify/scalability2.ll | 2 +- polly/test/Simplify/sweep_mapped_phi.ll | 2 +- polly/test/Simplify/sweep_mapped_value.ll | 2 +- .../Simplify/ununsed_read_in_region_entry.ll | 4 +- polly/test/Support/Plugins.ll | 3 +- polly/test/Support/exportjson.ll | 24 +- polly/test/Support/isl-args.ll | 8 +- polly/test/Support/pipelineposition.ll | 8 +- polly/test/lit.site.cfg.in | 4 + polly/test/polly.ll | 2 +- 1143 files changed, 4460 insertions(+), 2700 deletions(-) create mode 100644 polly/include/polly/LinkAllPasses.h delete mode 100644 polly/include/polly/Pass/PhaseManager.h delete mode 100644 polly/include/polly/Pass/PollyFunctionPass.h delete mode 100644 polly/include/polly/Pass/PollyModulePass.h delete mode 100644 polly/lib/Pass/PhaseManager.cpp delete mode 100644 polly/lib/Pass/PollyFunctionPass.cpp delete mode 100644 polly/lib/Pass/PollyModulePass.cpp delete mode 100644 polly/test/ScopInfo/licm_potential_store_mssa.ll diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index 215a802843304..f5ea47b69cf02 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -13,7 +13,3 @@ In Polly |version| the following important changes have been incorporated. * ScopInliner has been updated for the New Pass Manager. - * Polly now is a monolithic pass split into phases. - - * Polly's support for the legacy pass manager has been removed. 
- diff --git a/polly/include/polly/Canonicalization.h b/polly/include/polly/Canonicalization.h index 972b660894a1c..03f277e4e91ba 100644 --- a/polly/include/polly/Canonicalization.h +++ b/polly/include/polly/Canonicalization.h @@ -11,6 +11,12 @@ #include "llvm/Passes/PassBuilder.h" +namespace llvm { +namespace legacy { +class PassManagerBase; +} +} // namespace llvm + namespace polly { /// Schedule a set of canonicalization passes to prepare for Polly. @@ -20,6 +26,8 @@ namespace polly { /// into a canonical form that simplifies the analysis and optimization passes /// of Polly. The set of optimization passes scheduled here is probably not yet /// optimal. TODO: Optimize the set of canonicalization passes. +void registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM); + llvm::FunctionPassManager buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level); diff --git a/polly/include/polly/CodeGen/CodeGeneration.h b/polly/include/polly/CodeGen/CodeGeneration.h index 2340fbe016b49..57aec1d70cc72 100644 --- a/polly/include/polly/CodeGen/CodeGeneration.h +++ b/polly/include/polly/CodeGen/CodeGeneration.h @@ -14,7 +14,6 @@ #include "llvm/IR/PassManager.h" namespace polly { -class IslAstInfo; enum VectorizerChoice { VECTORIZER_NONE, @@ -34,8 +33,6 @@ struct CodeGenerationPass final : PassInfoMixin<CodeGenerationPass> { }; extern bool PerfMonitoring; - -bool runCodeGeneration(Scop &S, llvm::RegionInfo &RI, IslAstInfo &AI); } // namespace polly #endif // POLLY_CODEGENERATION_H diff --git a/polly/include/polly/CodeGen/IslAst.h b/polly/include/polly/CodeGen/IslAst.h index 3e1ff2c8a24da..c99a4957d6b48 100644 --- a/polly/include/polly/CodeGen/IslAst.h +++ b/polly/include/polly/CodeGen/IslAst.h @@ -21,7 +21,6 @@ #ifndef POLLY_ISLAST_H #define POLLY_ISLAST_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/PassManager.h" @@ -173,6 +172,33 @@ struct IslAstAnalysis : AnalysisInfoMixin<IslAstAnalysis> { ScopStandardAnalysisResults &SAR); }; +class IslAstInfoWrapperPass final : public ScopPass { + std::unique_ptr<IslAstInfo> Ast; + +public: + static char ID; + + IslAstInfoWrapperPass() : ScopPass(ID) {} + + IslAstInfo &getAI() { return *Ast; } + const IslAstInfo &getAI() const { return *Ast; } + + /// Build the AST for the given SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Release the internal memory. + void releaseMemory() override; + + /// Print a source code representation of the program. 
+ void printScop(raw_ostream &OS, Scop &S) const override; +}; + +llvm::Pass *createIslAstInfoWrapperPassPass(); +llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); + struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { IslAstPrinterPass(raw_ostream &OS) : OS(OS) {} @@ -181,9 +207,11 @@ struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { raw_ostream &OS; }; - -std::unique_ptr<IslAstInfo> runIslAstGen(Scop &S, - DependenceAnalysis::Result &DA); } // namespace polly +namespace llvm { +void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); +void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_ISLAST_H diff --git a/polly/include/polly/CodePreparation.h b/polly/include/polly/CodePreparation.h index 1a15e3d4d5a29..c6bc526db209d 100644 --- a/polly/include/polly/CodePreparation.h +++ b/polly/include/polly/CodePreparation.h @@ -15,12 +15,6 @@ #include "llvm/IR/PassManager.h" -namespace llvm { -class DominatorTree; -class LoopInfo; -class RegionInfo; -} // namespace llvm - namespace polly { struct CodePreparationPass final : llvm::PassInfoMixin<CodePreparationPass> { llvm::PreservedAnalyses run(llvm::Function &F, diff --git a/polly/include/polly/DeLICM.h b/polly/include/polly/DeLICM.h index 63fc509e0bd46..0e03c04079480 100644 --- a/polly/include/polly/DeLICM.h +++ b/polly/include/polly/DeLICM.h @@ -21,10 +21,15 @@ #include "isl/isl-noexceptions.h" namespace llvm { +class PassRegistry; +class Pass; class raw_ostream; } // namespace llvm namespace polly { +/// Create a new DeLICM pass instance. +llvm::Pass *createDeLICMWrapperPass(); +llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); struct DeLICMPass final : llvm::PassInfoMixin<DeLICMPass> { DeLICMPass() {} @@ -54,7 +59,11 @@ bool isConflicting(isl::union_set ExistingOccupied, isl::union_map ProposedWrites, llvm::raw_ostream *OS = nullptr, unsigned Indent = 0); -bool runDeLICM(Scop &S); } // namespace polly +namespace llvm { +void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); +void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_DELICM_H */ diff --git a/polly/include/polly/DeadCodeElimination.h b/polly/include/polly/DeadCodeElimination.h index 4d8da56c76eec..d416afa030c56 100644 --- a/polly/include/polly/DeadCodeElimination.h +++ b/polly/include/polly/DeadCodeElimination.h @@ -13,10 +13,16 @@ #ifndef POLLY_DEADCODEELIMINATION_H #define POLLY_DEADCODEELIMINATION_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" +namespace llvm { +class PassRegistry; +class Pass; +class raw_ostream; +} // namespace llvm + namespace polly { +llvm::Pass *createDeadCodeElimWrapperPass(); struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { DeadCodeElimPass() {} @@ -25,7 +31,10 @@ struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; -bool runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA); } // namespace polly +namespace llvm { +void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_DEADCODEELIMINATION_H */ diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index 88ea468dd5473..d562ad80592f2 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -145,6 +145,7 @@ class Dependences final { friend struct DependenceAnalysis; friend struct 
DependenceInfoPrinterPass; friend class DependenceInfo; + friend class DependenceInfoWrapperPass; /// Destructor that will free internal objects. ~Dependences() { releaseMemory(); } @@ -191,8 +192,6 @@ class Dependences final { const AnalysisLevel Level; }; -extern Dependences::AnalysisLevel OptAnalysisLevel; - struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> { static AnalysisKey Key; struct Result { @@ -233,7 +232,108 @@ struct DependenceInfoPrinterPass final raw_ostream &OS; }; -DependenceAnalysis::Result runDependenceAnalysis(Scop &S); +class DependenceInfo final : public ScopPass { +public: + static char ID; + + /// Construct a new DependenceInfo pass. + DependenceInfo() : ScopPass(ID) {} + + /// Return the dependence information for the current SCoP. + /// + /// @param Level The granularity of dependence analysis result. + /// + /// @return The dependence analysis result + /// + const Dependences &getDependences(Dependences::AnalysisLevel Level); + + /// Recompute dependences from schedule and memory accesses. + const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); + + /// Invalidate the dependence information and recompute it when needed again. + /// May be required when the underlying Scop was changed in a way that would + /// add new dependencies (e.g. between new statement instances inserted into + /// the SCoP) or intentionally break existing ones. It is not required when + /// updating the schedule that conforms to the existing dependencies. + void abandonDependences(); + + /// Compute the dependence information for the SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Print the dependences for the given SCoP to @p OS. + void printScop(raw_ostream &OS, Scop &) const override; + + /// Release the internal memory. + void releaseMemory() override { + for (auto &d : D) + d.reset(); + } + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + Scop *S; + + /// Dependences struct for the current SCoP. + std::unique_ptr<Dependences> D[Dependences::NumAnalysisLevels]; +}; + +llvm::Pass *createDependenceInfoPass(); +llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); + +/// Construct a new DependenceInfoWrapper pass. +class DependenceInfoWrapperPass final : public FunctionPass { +public: + static char ID; + + /// Construct a new DependenceInfoWrapper pass. + DependenceInfoWrapperPass() : FunctionPass(ID) {} + + /// Return the dependence information for the given SCoP. + /// + /// @param S SCoP object. + /// @param Level The granularity of dependence analysis result. + /// + /// @return The dependence analysis result + /// + const Dependences &getDependences(Scop *S, Dependences::AnalysisLevel Level); + + /// Recompute dependences from schedule and memory accesses. + const Dependences &recomputeDependences(Scop *S, + Dependences::AnalysisLevel Level); + + /// Compute the dependence information on-the-fly for the function. + bool runOnFunction(Function &F) override; + + /// Print the dependences for the current function to @p OS. + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + /// Release the internal memory. + void releaseMemory() override { ScopToDepsMap.clear(); } + + /// Register all analyses and transformation required. 
+ void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + using ScopToDepsMapTy = DenseMap<Scop *, std::unique_ptr<Dependences>>; + + /// Scop to Dependence map for the current function. + ScopToDepsMapTy ScopToDepsMap; +}; + +llvm::Pass *createDependenceInfoWrapperPassPass(); +llvm::Pass * +createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); + } // namespace polly +namespace llvm { +void initializeDependenceInfoPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyFunctionPassPass( + llvm::PassRegistry &); +} // namespace llvm + #endif diff --git a/polly/include/polly/FlattenSchedule.h b/polly/include/polly/FlattenSchedule.h index 154344d2f5c3e..3ef3c304243df 100644 --- a/polly/include/polly/FlattenSchedule.h +++ b/polly/include/polly/FlattenSchedule.h @@ -15,10 +15,20 @@ #ifndef POLLY_FLATTENSCHEDULE_H #define POLLY_FLATTENSCHEDULE_H -namespace polly { -class Scop; +namespace llvm { +class PassRegistry; +class Pass; +class raw_ostream; +} // namespace llvm -void runFlattenSchedulePass(Scop &S); +namespace polly { +llvm::Pass *createFlattenSchedulePass(); +llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly +namespace llvm { +void initializeFlattenSchedulePass(llvm::PassRegistry &); +void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_FLATTENSCHEDULE_H */ diff --git a/polly/include/polly/ForwardOpTree.h b/polly/include/polly/ForwardOpTree.h index 8b2ece1f08e15..b5da0f513ab78 100644 --- a/polly/include/polly/ForwardOpTree.h +++ b/polly/include/polly/ForwardOpTree.h @@ -15,7 +15,13 @@ #include "polly/ScopPass.h" +namespace llvm { +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createForwardOpTreeWrapperPass(); +llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); struct ForwardOpTreePass final : llvm::PassInfoMixin<ForwardOpTreePass> { ForwardOpTreePass() {} @@ -35,15 +41,11 @@ struct ForwardOpTreePrinterPass final llvm::raw_ostream &OS; }; -/// Pass that redirects scalar reads to array elements that are known to contain -/// the same value. -/// -/// This reduces the number of scalar accesses and therefore potentially -/// increases the freedom of the scheduler. In the ideal case, all reads of a -/// scalar definition are redirected (We currently do not care about removing -/// the write in this case). This is also useful for the main DeLICM pass as -/// there are less scalars to be mapped. -bool runForwardOpTree(Scop &S); } // namespace polly +namespace llvm { +void initializeForwardOpTreeWrapperPassPass(PassRegistry &); +void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); +} // namespace llvm + #endif // POLLY_FORWARDOPTREE_H diff --git a/polly/include/polly/JSONExporter.h b/polly/include/polly/JSONExporter.h index 82a881c737064..958f95ea11404 100644 --- a/polly/include/polly/JSONExporter.h +++ b/polly/include/polly/JSONExporter.h @@ -9,11 +9,13 @@ #ifndef POLLY_JSONEXPORTER_H #define POLLY_JSONEXPORTER_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" namespace polly { +llvm::Pass *createJSONExporterPass(); +llvm::Pass *createJSONImporterPass(); +llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); /// This pass exports a scop to a jscop file. 
The filename is generated from the /// concatenation of the function and scop name. @@ -28,9 +30,12 @@ struct JSONImportPass final : llvm::PassInfoMixin<JSONExportPass> { llvm::PreservedAnalyses run(Scop &, ScopAnalysisManager &, ScopStandardAnalysisResults &, SPMUpdater &); }; - -void runImportJSON(Scop &S, DependenceAnalysis::Result &DA); -void runExportJSON(Scop &S); } // namespace polly +namespace llvm { +void initializeJSONExporterPass(llvm::PassRegistry &); +void initializeJSONImporterPass(llvm::PassRegistry &); +void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_JSONEXPORTER_H */ diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h new file mode 100644 index 0000000000000..9978344c73e9f --- /dev/null +++ b/polly/include/polly/LinkAllPasses.h @@ -0,0 +1,156 @@ +//===- polly/LinkAllPasses.h ----------- Reference All Passes ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file pulls in all transformation and analysis passes for tools +// like opt and bugpoint that need this functionality. +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_LINKALLPASSES_H +#define POLLY_LINKALLPASSES_H + +#include "polly/Config/config.h" +#include "polly/Support/DumpFunctionPass.h" +#include "polly/Support/DumpModulePass.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/AlwaysTrue.h" + +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + +namespace polly { +llvm::Pass *createCodePreparationPass(); +llvm::Pass *createScopInlinerPass(); +llvm::Pass *createDeadCodeElimWrapperPass(); +llvm::Pass *createDependenceInfoPass(); +llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createDependenceInfoWrapperPassPass(); +llvm::Pass * +createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); +llvm::Pass *createDOTOnlyPrinterWrapperPass(); +llvm::Pass *createDOTOnlyViewerWrapperPass(); +llvm::Pass *createDOTPrinterWrapperPass(); +llvm::Pass *createDOTViewerWrapperPass(); +llvm::Pass *createJSONExporterPass(); +llvm::Pass *createJSONImporterPass(); +llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createPollyCanonicalizePass(); +llvm::Pass *createScopDetectionWrapperPassPass(); +llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createScopInfoRegionPassPass(); +llvm::Pass *createScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS); +llvm::Pass *createScopInfoWrapperPassPass(); +llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); +llvm::Pass *createIslAstInfoWrapperPassPass(); +llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createCodeGenerationPass(); +llvm::Pass *createIslScheduleOptimizerWrapperPass(); +llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createFlattenSchedulePass(); +llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createForwardOpTreeWrapperPass(); +llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createDeLICMWrapperPass(); +llvm::Pass 
*createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createMaximalStaticExpansionPass(); +llvm::Pass *createSimplifyWrapperPass(int); +llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createPruneUnprofitableWrapperPass(); + +extern char &CodePreparationID; +} // namespace polly + +namespace { +struct PollyForcePassLinking { + PollyForcePassLinking() { + // We must reference the passes in such a way that compilers will not + // delete them as dead code, even with whole program optimization, yet + // the references are effectively a NO-OP. + if (llvm::getNonFoldableAlwaysTrue()) + return; + + polly::createCodePreparationPass(); + polly::createDeadCodeElimWrapperPass(); + polly::createDependenceInfoPass(); + polly::createDependenceInfoPrinterLegacyPass(llvm::outs()); + polly::createDependenceInfoWrapperPassPass(); + polly::createDependenceInfoPrinterLegacyFunctionPass(llvm::outs()); + polly::createDOTOnlyPrinterWrapperPass(); + polly::createDOTOnlyViewerWrapperPass(); + polly::createDOTPrinterWrapperPass(); + polly::createDOTViewerWrapperPass(); + polly::createJSONExporterPass(); + polly::createJSONImporterPass(); + polly::createJSONImporterPrinterLegacyPass(llvm::outs()); + polly::createScopDetectionWrapperPassPass(); + polly::createScopDetectionPrinterLegacyPass(llvm::outs()); + polly::createScopInfoRegionPassPass(); + polly::createScopInfoPrinterLegacyRegionPass(llvm::outs()); + polly::createScopInfoWrapperPassPass(); + polly::createScopInfoPrinterLegacyFunctionPass(llvm::outs()); + polly::createPollyCanonicalizePass(); + polly::createIslAstInfoWrapperPassPass(); + polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); + polly::createCodeGenerationPass(); + polly::createIslScheduleOptimizerWrapperPass(); + polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); + polly::createMaximalStaticExpansionPass(); + polly::createFlattenSchedulePass(); + polly::createFlattenSchedulePrinterLegacyPass(llvm::errs()); + polly::createForwardOpTreeWrapperPass(); + polly::createForwardOpTreePrinterLegacyPass(llvm::errs()); + polly::createDeLICMWrapperPass(); + polly::createDeLICMPrinterLegacyPass(llvm::outs()); + polly::createDumpModuleWrapperPass("", true); + polly::createDumpFunctionWrapperPass(""); + polly::createSimplifyWrapperPass(0); + polly::createSimplifyPrinterLegacyPass(llvm::outs()); + polly::createPruneUnprofitableWrapperPass(); + } +} PollyForcePassLinking; // Force link by creating a global definition. 
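The guard in the constructor above is what makes this idiom work: llvm::getNonFoldableAlwaysTrue() (from llvm/Support/AlwaysTrue.h, included at the top of this header) returns true at runtime but is opaque to the optimizer, so the factory calls after the early return are never executed yet can never be proven dead. A minimal sketch of the same pattern, using a hypothetical createFooPass() factory in place of the real Polly list above:

#include "llvm/Support/AlwaysTrue.h"

namespace llvm {
class Pass;
} // namespace llvm

// Hypothetical factory standing in for any createXxxPass() declaration
// above; only the reference matters here, never the returned object.
llvm::Pass *createFooPass();

namespace {
struct ForceFooLinking {
  ForceFooLinking() {
    // Always taken at runtime, but the optimizer cannot fold it away, so
    // the call below survives as a live reference in the object file.
    if (llvm::getNonFoldableAlwaysTrue())
      return;
    (void)createFooPass(); // never executed
  }
} ForceFooLinking; // global constructor forces the link-time reference
} // namespace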
+} // namespace + +namespace llvm { +void initializeCodePreparationPass(llvm::PassRegistry &); +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeScopInfoRegionPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyRegionPassPass(llvm::PassRegistry &); +void initializeScopInfoWrapperPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); +void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); +void initializeJSONExporterPass(llvm::PassRegistry &); +void initializeJSONImporterPass(llvm::PassRegistry &); +void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyFunctionPassPass( + llvm::PassRegistry &); +void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); +void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeCodeGenerationPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); +void initializePollyCanonicalizePass(llvm::PassRegistry &); +void initializeFlattenSchedulePass(llvm::PassRegistry &); +void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); +void initializeForwardOpTreeWrapperPassPass(llvm::PassRegistry &); +void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); +void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); +void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); +void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); +void initializePruneUnprofitableWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + +#endif diff --git a/polly/include/polly/MaximalStaticExpansion.h b/polly/include/polly/MaximalStaticExpansion.h index 1f9fbcb1d6a70..88827b2700887 100644 --- a/polly/include/polly/MaximalStaticExpansion.h +++ b/polly/include/polly/MaximalStaticExpansion.h @@ -14,7 +14,6 @@ #ifndef POLLY_MAXIMALSTATICEXPANSION_H #define POLLY_MAXIMALSTATICEXPANSION_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" @@ -38,7 +37,6 @@ struct MaximalStaticExpansionPrinterPass llvm::raw_ostream &OS; }; -void runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI); } // namespace polly #endif /* POLLY_MAXIMALSTATICEXPANSION_H */ diff --git a/polly/include/polly/Pass/PhaseManager.h b/polly/include/polly/Pass/PhaseManager.h deleted file mode 100644 index 9ff9bbf02d71f..0000000000000 --- a/polly/include/polly/Pass/PhaseManager.h +++ /dev/null @@ -1,127 +0,0 @@ -//===------ PhaseManager.h --------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implements the sequence of operations on SCoPs, called phases. It is itelf -// not a pass in either pass manager, but used from PollyFunctionPass or -// PollyModulePass. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_PHASEMANAGER_H_ -#define POLLY_PASS_PHASEMANAGER_H_ - -#include "polly/DependenceInfo.h" -#include "llvm/ADT/Bitset.h" -#include <stddef.h> - -namespace llvm { -class Function; -class Error; -} // namespace llvm - -namespace polly { - -/// Phases (in execution order) within the Polly pass. -enum class PassPhase { - None, - - Prepare, - - Detection, - PrintDetect, - DotScops, - DotScopsOnly, - ViewScops, - ViewScopsOnly, - - ScopInfo, - PrintScopInfo, - - Flatten, - - Dependences, - PrintDependences, - - ImportJScop, - Simplify0, - Optree, - DeLICM, - Simplify1, - DeadCodeElimination, - MaximumStaticExtension, - PruneUnprofitable, - Optimization, - ExportJScop, - AstGen, - CodeGen, - - PassPhaseFirst = Prepare, - PassPhaseLast = CodeGen -}; - -StringRef getPhaseName(PassPhase Phase); -PassPhase parsePhase(StringRef Name); -bool dependsOnDependenceInfo(PassPhase Phase); - -/// Options for the Polly pass. -class PollyPassOptions { - /// For each Polly phase, whether it should be executed. - /// Since PassPhase::None is unused, bit positions are shifted by one. - llvm::Bitset<static_cast<size_t>(PassPhase::PassPhaseLast) - - static_cast<size_t>(PassPhase::PassPhaseFirst) + 1> - PhaseEnabled; - -public: - bool ViewAll = false; - std::string ViewFilter; - Dependences::AnalysisLevel PrintDepsAnalysisLevel = Dependences::AL_Statement; - - bool isPhaseEnabled(PassPhase Phase) const { - assert(Phase != PassPhase::None); - unsigned BitPos = static_cast<size_t>(Phase) - - static_cast<size_t>(PassPhase::PassPhaseFirst); - return PhaseEnabled[BitPos]; - } - - void setPhaseEnabled(PassPhase Phase, bool Enabled = true) { - assert(Phase != PassPhase::None); - unsigned BitPos = static_cast<size_t>(Phase) - - static_cast<size_t>(PassPhase::PassPhaseFirst); - if (Enabled) - PhaseEnabled.set(BitPos); - else - PhaseEnabled.reset(BitPos); - } - - /// Enable all phases that are necessary for a roundtrip from LLVM-IR back to - /// LLVM-IR. - void enableEnd2End(); - - /// Enabled the default optimization phases. - void enableDefaultOpts(); - - /// Disable all phases following \p Phase. - /// Useful when regression testing that particular phase and everything after - /// it is not of interest. - void disableAfter(PassPhase Phase); - - /// Check whether the options are coherent relative to each other. - llvm::Error checkConsistency() const; -}; - -/// Run Polly and its phases on \p F. -bool runPollyPass(Function &F, llvm::FunctionAnalysisManager &FAM, - PollyPassOptions Opts); -} // namespace polly - -/// Make llvm::enum_seq<PassPhase> work. 
-template <> struct llvm::enum_iteration_traits<polly::PassPhase> { - static constexpr bool is_iterable = true; -}; - -#endif /* POLLY_PASS_PHASEMANAGER_H_ */ diff --git a/polly/include/polly/Pass/PollyFunctionPass.h b/polly/include/polly/Pass/PollyFunctionPass.h deleted file mode 100644 index dd0d4e77d7a80..0000000000000 --- a/polly/include/polly/Pass/PollyFunctionPass.h +++ /dev/null @@ -1,32 +0,0 @@ -//===------ PollyFunctionPass.h - Polly function pass ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_POLLYFUNCTIONPASS_H_ -#define POLLY_PASS_POLLYFUNCTIONPASS_H_ - -#include "polly/Pass/PhaseManager.h" -#include "llvm/IR/Analysis.h" -#include "llvm/IR/PassManager.h" -#include <utility> - -namespace polly { - -class PollyFunctionPass : public llvm::PassInfoMixin<PollyFunctionPass> { -public: - PollyFunctionPass() {} - PollyFunctionPass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} - - llvm::PreservedAnalyses run(llvm::Function &F, - llvm::FunctionAnalysisManager &); - -private: - PollyPassOptions Opts; -}; -} // namespace polly - -#endif /* POLLY_PASS_POLLYFUNCTIONPASS_H_ */ diff --git a/polly/include/polly/Pass/PollyModulePass.h b/polly/include/polly/Pass/PollyModulePass.h deleted file mode 100644 index 2214bbf3d143e..0000000000000 --- a/polly/include/polly/Pass/PollyModulePass.h +++ /dev/null @@ -1,30 +0,0 @@ -//===------ PollyModulePass.h - Polly module pass -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_POLLYMODULEPASS_H_ -#define POLLY_PASS_POLLYMODULEPASS_H_ - -#include "polly/Pass/PhaseManager.h" -#include "llvm/IR/PassManager.h" - -namespace polly { - -class PollyModulePass : public llvm::PassInfoMixin<PollyModulePass> { -public: - PollyModulePass() {} - PollyModulePass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} - - llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &); - -private: - PollyPassOptions Opts; -}; - -} // namespace polly - -#endif /* POLLY_PASS_POLLYMODULEPASS_H_ */ diff --git a/polly/include/polly/PruneUnprofitable.h b/polly/include/polly/PruneUnprofitable.h index 16b76cc62f1d2..2d285cce69ad4 100644 --- a/polly/include/polly/PruneUnprofitable.h +++ b/polly/include/polly/PruneUnprofitable.h @@ -15,7 +15,13 @@ #include "polly/ScopPass.h" +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createPruneUnprofitableWrapperPass(); struct PruneUnprofitablePass final : llvm::PassInfoMixin<PruneUnprofitablePass> { @@ -24,8 +30,10 @@ struct PruneUnprofitablePass final llvm::PreservedAnalyses run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; - -bool runPruneUnprofitable(Scop &S); } // namespace polly +namespace llvm { +void initializePruneUnprofitableWrapperPassPass(PassRegistry &); +} + #endif // POLLY_PRUNEUNPROFITABLE_H diff --git a/polly/include/polly/RegisterPasses.h b/polly/include/polly/RegisterPasses.h index 7819462cb0c36..3a81e1ba7487d 100644 --- a/polly/include/polly/RegisterPasses.h +++ b/polly/include/polly/RegisterPasses.h @@ -14,6 +14,7 @@ #define POLLY_REGISTER_PASSES_H namespace llvm { +class PassRegistry; class PassBuilder; struct PassPluginLibraryInfo; namespace legacy { @@ -22,6 +23,7 @@ class PassManagerBase; } // namespace llvm namespace polly { +void initializePollyPasses(llvm::PassRegistry &Registry); void registerPollyPasses(llvm::PassBuilder &PB); } // namespace polly diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h index ac45572ba7ed5..3e17eeff49ae3 100644 --- a/polly/include/polly/ScheduleOptimizer.h +++ b/polly/include/polly/ScheduleOptimizer.h @@ -9,10 +9,16 @@ #ifndef POLLY_SCHEDULEOPTIMIZER_H #define POLLY_SCHEDULEOPTIMIZER_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createIslScheduleOptimizerWrapperPass(); +llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); struct IslScheduleOptimizerPass final : llvm::PassInfoMixin<IslScheduleOptimizerPass> { @@ -32,9 +38,11 @@ struct IslScheduleOptimizerPrinterPass final private: llvm::raw_ostream &OS; }; - -void runIslScheduleOptimizer(Scop &S, llvm::TargetTransformInfo *TTI, - DependenceAnalysis::Result &Deps); } // namespace polly +namespace llvm { +void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_SCHEDULEOPTIMIZER_H diff --git a/polly/include/polly/ScopDetection.h b/polly/include/polly/ScopDetection.h index ded1c88206430..5759f75463284 100644 --- a/polly/include/polly/ScopDetection.h +++ b/polly/include/polly/ScopDetection.h @@ -52,6 +52,7 @@ #include "llvm/Analysis/AliasSetTracker.h" 
#include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Pass.h" #include <set> namespace polly { @@ -67,6 +68,7 @@ using llvm::DenseMap; using llvm::DominatorTree; using llvm::Function; using llvm::FunctionAnalysisManager; +using llvm::FunctionPass; using llvm::IntrinsicInst; using llvm::LoopInfo; using llvm::Module; @@ -629,6 +631,31 @@ struct ScopAnalysisPrinterPass final : PassInfoMixin<ScopAnalysisPrinterPass> { raw_ostream &OS; }; + +class ScopDetectionWrapperPass final : public FunctionPass { + std::unique_ptr<ScopDetection> Result; + +public: + ScopDetectionWrapperPass(); + + /// @name FunctionPass interface + ///@{ + static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnFunction(Function &F) override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + ///@} + + ScopDetection &getSD() const { return *Result; } +}; + +llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly +namespace llvm { +void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_SCOPDETECTION_H diff --git a/polly/include/polly/ScopGraphPrinter.h b/polly/include/polly/ScopGraphPrinter.h index c4e669f0c3503..b57732ad3d70d 100644 --- a/polly/include/polly/ScopGraphPrinter.h +++ b/polly/include/polly/ScopGraphPrinter.h @@ -70,9 +70,6 @@ struct DOTGraphTraits<polly::ScopDetection *> : DOTGraphTraits<RegionNode *> { namespace polly { -extern std::string ViewFilter; -extern bool ViewAll; - struct ScopViewer final : llvm::DOTGraphTraitsViewer<ScopAnalysis, false> { ScopViewer() : llvm::DOTGraphTraitsViewer<ScopAnalysis, false>("scops") {} diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index 7541ddc21e39f..f700144165d53 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -23,11 +23,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/RegionPass.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" #include "isl/isl-noexceptions.h" #include <cassert> #include <cstddef> @@ -53,6 +55,8 @@ using llvm::MemIntrinsic; using llvm::PassInfoMixin; using llvm::PHINode; using llvm::RegionNode; +using llvm::RegionPass; +using llvm::RGPassManager; using llvm::SetVector; using llvm::SmallPtrSetImpl; using llvm::SmallVector; @@ -2670,6 +2674,39 @@ class Scop final { /// Print Scop scop to raw_ostream OS. raw_ostream &operator<<(raw_ostream &OS, const Scop &scop); +/// The legacy pass manager's analysis pass to compute scop information +/// for a region. +class ScopInfoRegionPass final : public RegionPass { + /// The Scop pointer which is used to construct a Scop. + std::unique_ptr<Scop> S; + +public: + static char ID; // Pass identification, replacement for typeid + + ScopInfoRegionPass() : RegionPass(ID) {} + ~ScopInfoRegionPass() override = default; + + /// Build Scop object, the Polly IR of static control + /// part for the current SESE-Region. + /// + /// @return If the current region is a valid for a static control part, + /// return the Polly IR representing this static control part, + /// return null otherwise. 
+ Scop *getScop() { return S.get(); } + const Scop *getScop() const { return S.get(); } + + /// Calculate the polyhedral scop information for a given Region. + bool runOnRegion(Region *R, RGPassManager &RGM) override; + + void releaseMemory() override { S.reset(); } + + void print(raw_ostream &O, const Module *M = nullptr) const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +llvm::Pass *createScopInfoPrinterLegacyRegionPass(raw_ostream &OS); + class ScopInfo { public: using RegionToScopMapTy = MapVector<Region *, std::unique_ptr<Scop>>; @@ -2744,6 +2781,45 @@ struct ScopInfoPrinterPass final : PassInfoMixin<ScopInfoPrinterPass> { raw_ostream &Stream; }; + +//===----------------------------------------------------------------------===// +/// The legacy pass manager's analysis pass to compute scop information +/// for the whole function. +/// +/// This pass will maintain a map of the maximal region within a scop to its +/// scop object for all the feasible scops present in a function. +/// This pass is an alternative to the ScopInfoRegionPass in order to avoid a +/// region pass manager. +class ScopInfoWrapperPass final : public FunctionPass { + std::unique_ptr<ScopInfo> Result; + +public: + ScopInfoWrapperPass() : FunctionPass(ID) {} + ~ScopInfoWrapperPass() override = default; + + static char ID; // Pass identification, replacement for typeid + + ScopInfo *getSI() { return Result.get(); } + const ScopInfo *getSI() const { return Result.get(); } + + /// Calculate all the polyhedral scops for a given function. + bool runOnFunction(Function &F) override; + + void releaseMemory() override { Result.reset(); } + + void print(raw_ostream &O, const Module *M = nullptr) const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); } // end namespace polly +namespace llvm { +void initializeScopInfoRegionPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyRegionPassPass(PassRegistry &); +void initializeScopInfoWrapperPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); +} // end namespace llvm + #endif // POLLY_SCOPINFO_H diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h index ae1938f03ac70..014667804330f 100644 --- a/polly/include/polly/ScopInliner.h +++ b/polly/include/polly/ScopInliner.h @@ -23,6 +23,12 @@ class ScopInlinerPass : public llvm::PassInfoMixin<ScopInlinerPass> { llvm::LazyCallGraph &CG, llvm::CGSCCUpdateResult &UR); }; + +llvm::Pass *createScopInlinerWrapperPass(); } // namespace polly +namespace llvm { +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); +} + #endif /* POLLY_POLLYINLINER_H */ diff --git a/polly/include/polly/ScopPass.h b/polly/include/polly/ScopPass.h index 80ccd5717f96c..144cfd1364393 100644 --- a/polly/include/polly/ScopPass.h +++ b/polly/include/polly/ScopPass.h @@ -19,6 +19,7 @@ #include "polly/ScopInfo.h" #include "llvm/ADT/PriorityWorklist.h" +#include "llvm/Analysis/RegionPass.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassManagerImpl.h" @@ -154,6 +155,33 @@ using ScopPassManager = PassManager<Scop, ScopAnalysisManager, ScopStandardAnalysisResults &, SPMUpdater &>; +/// ScopPass - This class adapts the RegionPass interface to allow convenient +/// creation of passes that operate on the Polly IR. Instead of overriding +/// runOnRegion, subclasses override runOnScop. 
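For illustration, a hypothetical subclass of the ScopPass adapter declared just below might look like the following sketch (not part of this patch): it overrides only runOnScop and chains getAnalysisUsage as the adapter requires.

#include "polly/ScopPass.h"
#include "llvm/Support/raw_ostream.h"
#include <iterator>

namespace {
// Hypothetical analysis-only pass: reports the number of statements in
// each SCoP. The ScopPass adapter translates runOnRegion into runOnScop.
class CountStmtsWrapperPass final : public polly::ScopPass {
public:
  static char ID;
  CountStmtsWrapperPass() : polly::ScopPass(ID) {}

  bool runOnScop(polly::Scop &S) override {
    llvm::errs() << S.getRegion().getNameStr() << ": "
                 << std::distance(S.begin(), S.end()) << " statements\n";
    return false; // nothing was modified
  }

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    polly::ScopPass::getAnalysisUsage(AU); // mandatory for subclasses
    AU.setPreservesAll();
  }
};
char CountStmtsWrapperPass::ID = 0;
} // namespace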
+class ScopPass : public RegionPass { + Scop *S; + +protected: + explicit ScopPass(char &ID) : RegionPass(ID), S(nullptr) {} + + /// runOnScop - This method must be overloaded to perform the + /// desired Polyhedral transformation or analysis. + /// + virtual bool runOnScop(Scop &S) = 0; + + /// Print method for SCoPs. + virtual void printScop(raw_ostream &OS, Scop &S) const {} + + /// getAnalysisUsage - Subclasses that override getAnalysisUsage + /// must call this. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool runOnRegion(Region *R, RGPassManager &RGM) override; + void print(raw_ostream &OS, const Module *) const override; +}; + struct ScopStandardAnalysisResults { DominatorTree &DT; ScopInfo &SI; diff --git a/polly/include/polly/Simplify.h b/polly/include/polly/Simplify.h index 4565eb26edaf0..b2aa58d850fae 100644 --- a/polly/include/polly/Simplify.h +++ b/polly/include/polly/Simplify.h @@ -16,6 +16,11 @@ #include "polly/ScopPass.h" #include "llvm/ADT/SmallVector.h" +namespace llvm { +class PassRegistry; +class Pass; +} // namespace llvm + namespace polly { class MemoryAccess; class ScopStmt; @@ -36,6 +41,17 @@ class ScopStmt; /// undefined. llvm::SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt); +/// Create a Simplify pass +/// +/// @param CallNo Disambiguates this instance for when there are multiple +/// instances of this pass in the pass manager. It is used only to +/// keep the statistics apart and has no influence on the +/// simplification itself. +/// +/// @return The Simplify pass. +llvm::Pass *createSimplifyWrapperPass(int CallNo = 0); +llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); + struct SimplifyPass final : PassInfoMixin<SimplifyPass> { SimplifyPass(int CallNo = 0) : CallNo(CallNo) {} @@ -57,8 +73,11 @@ struct SimplifyPrinterPass final : PassInfoMixin<SimplifyPrinterPass> { raw_ostream &OS; int CallNo; }; - -bool runSimplify(Scop &S, int CallNo); } // namespace polly +namespace llvm { +void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); +void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_TRANSFORM_SIMPLIFY_H */ diff --git a/polly/include/polly/Support/DumpFunctionPass.h b/polly/include/polly/Support/DumpFunctionPass.h index af04912ed4fe2..e5c16203adb8f 100644 --- a/polly/include/polly/Support/DumpFunctionPass.h +++ b/polly/include/polly/Support/DumpFunctionPass.h @@ -16,7 +16,13 @@ #include "llvm/IR/PassManager.h" #include <string> +namespace llvm { +class FunctionPass; +class ModulePass; +} // namespace llvm + namespace polly { +llvm::FunctionPass *createDumpFunctionWrapperPass(std::string Suffix); /// A pass that isolates a function into a new Module and writes it into a file. 
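The CallNo parameter documented above exists purely so that -stats output from multiple Simplify instances can be told apart. A short sketch of how a legacy pipeline might schedule the pass twice; the function name and the surrounding pipeline are illustrative only:

#include "polly/Simplify.h"
#include "llvm/IR/LegacyPassManager.h"

// Illustrative only: two Simplify runs in one legacy pipeline, as Polly
// does before and after its mid-pipeline transforms. The distinct CallNo
// values separate the statistics; they do not change the simplification.
void addTwoSimplifyRuns(llvm::legacy::PassManagerBase &PM) {
  PM.add(polly::createSimplifyWrapperPass(/*CallNo=*/0));
  // ... transform passes (e.g. forward-op-tree, DeLICM) go here ...
  PM.add(polly::createSimplifyWrapperPass(/*CallNo=*/1));
}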
struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { @@ -27,6 +33,12 @@ struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); }; + } // namespace polly +namespace llvm { +class PassRegistry; +void initializeDumpFunctionWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_SUPPORT_DUMPFUNCTIONPASS_H */ diff --git a/polly/include/polly/Support/DumpModulePass.h b/polly/include/polly/Support/DumpModulePass.h index 6d393a174b19b..c90bbc2484310 100644 --- a/polly/include/polly/Support/DumpModulePass.h +++ b/polly/include/polly/Support/DumpModulePass.h @@ -16,8 +16,12 @@ #include "llvm/IR/PassManager.h" #include <string> +namespace llvm { +class ModulePass; +} // namespace llvm + namespace polly { -/// A pass that prints the module into a file. +/// Create a pass that prints the module into a file. /// /// The meaning of @p Filename depends on @p IsSuffix. If IsSuffix==false, then /// the module is written to the @p Filename. If it is true, the filename is @@ -26,6 +30,10 @@ namespace polly { /// The intent of IsSuffix is to avoid the file being overwritten when /// processing multiple modules and/or with multiple dump passes in the /// pipeline. +llvm::ModulePass *createDumpModuleWrapperPass(std::string Filename, + bool IsSuffix); + +/// A pass that prints the module into a file. struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { std::string Filename; bool IsSuffix; @@ -38,4 +46,9 @@ struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { } // namespace polly +namespace llvm { +class PassRegistry; +void initializeDumpModuleWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_SUPPORT_DUMPMODULEPASS_H */ diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 38b731a9f7d8d..75891525ff7b3 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -358,6 +358,14 @@ namespace polly { void simplifyRegion(llvm::Region *R, llvm::DominatorTree *DT, llvm::LoopInfo *LI, llvm::RegionInfo *RI); +/// Split the entry block of a function to store the newly inserted +/// allocations outside of all Scops. +/// +/// @param EntryBlock The entry block of the current function. +/// @param P The pass that currently running. +/// +void splitEntryBlockForAlloca(llvm::BasicBlock *EntryBlock, llvm::Pass *P); + /// Split the entry block of a function to store the newly inserted /// allocations outside of all Scops. 
/// diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index 5183fc5725ece..c620f40ad0724 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -20,6 +20,7 @@ //===----------------------------------------------------------------------===// // #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/GICHelper.h" @@ -41,10 +42,6 @@ using namespace llvm; #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-dependence" -namespace polly { -Dependences::AnalysisLevel OptAnalysisLevel; -} - static cl::opt<int> OptComputeOut( "polly-dependences-computeout", cl::desc("Bound the dependence analysis by a maximal amount of " @@ -72,10 +69,9 @@ static cl::opt<enum AnalysisType> OptAnalysisType( "Overapproximation of dependences")), cl::Hidden, cl::init(VALUE_BASED_ANALYSIS), cl::cat(PollyCategory)); -static cl::opt<Dependences::AnalysisLevel, true> XOptAnalysisLevel( +static cl::opt<Dependences::AnalysisLevel> OptAnalysisLevel( "polly-dependences-analysis-level", cl::desc("The level of dependence analysis"), - cl::location(OptAnalysisLevel), cl::values(clEnumValN(Dependences::AL_Statement, "statement-wise", "Statement-level analysis"), clEnumValN(Dependences::AL_Reference, "reference-wise", @@ -885,7 +881,213 @@ DependenceInfoPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -DependenceAnalysis::Result polly::runDependenceAnalysis(Scop &S) { - DependenceAnalysis::Result Result{S, {}}; - return Result; +const Dependences & +DependenceInfo::getDependences(Dependences::AnalysisLevel Level) { + if (Dependences *d = D[Level].get()) + return *d; + + return recomputeDependences(Level); +} + +const Dependences & +DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) { + D[Level].reset(new Dependences(S->getSharedIslCtx(), Level)); + D[Level]->calculateDependences(*S); + return *D[Level]; +} + +void DependenceInfo::abandonDependences() { + for (std::unique_ptr<Dependences> &Deps : D) + Deps.release(); +} + +bool DependenceInfo::runOnScop(Scop &ScopVar) { + S = &ScopVar; + return false; +} + +/// Print the dependences for the given SCoP to @p OS. + +void polly::DependenceInfo::printScop(raw_ostream &OS, Scop &S) const { + if (auto d = D[OptAnalysisLevel].get()) { + d->print(OS); + return; + } + + // Otherwise create the dependences on-the-fly and print it + Dependences D(S.getSharedIslCtx(), OptAnalysisLevel); + D.calculateDependences(S); + D.print(OS); +} + +void DependenceInfo::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.setPreservesAll(); +} + +char DependenceInfo::ID = 0; + +Pass *polly::createDependenceInfoPass() { return new DependenceInfo(); } + +INITIALIZE_PASS_BEGIN(DependenceInfo, "polly-dependences", + "Polly - Calculate dependences", false, false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_END(DependenceInfo, "polly-dependences", + "Polly - Calculate dependences", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from DependenceAnalysis. 
+class DependenceInfoPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + DependenceInfoPrinterLegacyPass() : DependenceInfoPrinterLegacyPass(outs()) {} + + explicit DependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + DependenceInfo &P = getAnalysis<DependenceInfo>(); + + OS << "Printing analysis '" << P.getPassName() << "' for " + << "region: '" << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char DependenceInfoPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createDependenceInfoPrinterLegacyPass(raw_ostream &OS) { + return new DependenceInfoPrinterLegacyPass(OS); +} + +INITIALIZE_PASS_BEGIN(DependenceInfoPrinterLegacyPass, + "polly-print-dependences", "Polly - Print dependences", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_END(DependenceInfoPrinterLegacyPass, "polly-print-dependences", + "Polly - Print dependences", false, false) + +//===----------------------------------------------------------------------===// + +const Dependences & +DependenceInfoWrapperPass::getDependences(Scop *S, + Dependences::AnalysisLevel Level) { + auto It = ScopToDepsMap.find(S); + if (It != ScopToDepsMap.end()) + if (It->second) { + if (It->second->getDependenceLevel() == Level) + return *It->second; + } + return recomputeDependences(S, Level); +} + +const Dependences &DependenceInfoWrapperPass::recomputeDependences( + Scop *S, Dependences::AnalysisLevel Level) { + std::unique_ptr<Dependences> D(new Dependences(S->getSharedIslCtx(), Level)); + D->calculateDependences(*S); + auto Inserted = ScopToDepsMap.insert(std::make_pair(S, std::move(D))); + return *Inserted.first->second; } + +bool DependenceInfoWrapperPass::runOnFunction(Function &F) { + auto &SI = *getAnalysis<ScopInfoWrapperPass>().getSI(); + for (auto &It : SI) { + assert(It.second && "Invalid SCoP object!"); + recomputeDependences(It.second.get(), Dependences::AL_Access); + } + return false; +} + +void DependenceInfoWrapperPass::print(raw_ostream &OS, const Module *M) const { + for (auto &It : ScopToDepsMap) { + assert((It.first && It.second) && "Invalid Scop or Dependence object!\n"); + It.second->print(OS); + } +} + +void DependenceInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredTransitive<ScopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +char DependenceInfoWrapperPass::ID = 0; + +Pass *polly::createDependenceInfoWrapperPassPass() { + return new DependenceInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN( + DependenceInfoWrapperPass, "polly-function-dependences", + "Polly - Calculate dependences for all the SCoPs of a function", false, + false) +INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); +INITIALIZE_PASS_END( + DependenceInfoWrapperPass, "polly-function-dependences", + "Polly - Calculate dependences for all the SCoPs of a function", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from DependenceInfoWrapperPass. 
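As a usage sketch for the wrapper implemented above (the helper name is hypothetical): a client holding DependenceInfoWrapperPass can query per-SCoP dependences, and getDependences() reuses the cached result unless it was computed at a different analysis level, in which case it recomputes.

#include "polly/DependenceInfo.h"

// Hypothetical client helper: fetch statement-level dependences for one
// SCoP through the function-wide wrapper and check that the computation
// succeeded. hasValidDependences() is false if the dependence analysis
// was aborted, e.g. by the -polly-dependences-computeout bound.
static bool hasStatementDeps(polly::DependenceInfoWrapperPass &DIWP,
                             polly::Scop &S) {
  const polly::Dependences &D =
      DIWP.getDependences(&S, polly::Dependences::AL_Statement);
  return D.hasValidDependences();
}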
+class DependenceInfoPrinterLegacyFunctionPass final : public FunctionPass { +public: + static char ID; + + DependenceInfoPrinterLegacyFunctionPass() + : DependenceInfoPrinterLegacyFunctionPass(outs()) {} + + explicit DependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + DependenceInfoWrapperPass &P = getAnalysis<DependenceInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char DependenceInfoPrinterLegacyFunctionPass::ID = 0; +} // namespace + +Pass *polly::createDependenceInfoPrinterLegacyFunctionPass(raw_ostream &OS) { + return new DependenceInfoPrinterLegacyFunctionPass(OS); +} + +INITIALIZE_PASS_BEGIN( + DependenceInfoPrinterLegacyFunctionPass, "polly-print-function-dependences", + "Polly - Print dependences for all the SCoPs of a function", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfoWrapperPass); +INITIALIZE_PASS_END(DependenceInfoPrinterLegacyFunctionPass, + "polly-print-function-dependences", + "Polly - Print dependences for all the SCoPs of a function", + false, false) diff --git a/polly/lib/Analysis/PruneUnprofitable.cpp b/polly/lib/Analysis/PruneUnprofitable.cpp index 40cc9178da0f3..f8469c03fe55b 100644 --- a/polly/lib/Analysis/PruneUnprofitable.cpp +++ b/polly/lib/Analysis/PruneUnprofitable.cpp @@ -55,9 +55,8 @@ static void updateStatistics(Scop &S, bool Pruned) { NumAffineLoops += ScopStats.NumAffineLoops; } } -} // namespace -bool polly::runPruneUnprofitable(Scop &S) { +static bool runPruneUnprofitable(Scop &S) { if (PollyProcessUnprofitable) { POLLY_DEBUG( dbgs() << "NOTE: -polly-process-unprofitable active, won't prune " @@ -80,6 +79,35 @@ bool polly::runPruneUnprofitable(Scop &S) { return false; } +class PruneUnprofitableWrapperPass final : public ScopPass { +public: + static char ID; + + explicit PruneUnprofitableWrapperPass() : ScopPass(ID) {} + PruneUnprofitableWrapperPass(const PruneUnprofitableWrapperPass &) = delete; + PruneUnprofitableWrapperPass & + operator=(const PruneUnprofitableWrapperPass &) = delete; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScopInfoRegionPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { return runPruneUnprofitable(S); } +}; +} // namespace + +char PruneUnprofitableWrapperPass::ID; + +Pass *polly::createPruneUnprofitableWrapperPass() { + return new PruneUnprofitableWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", + "Polly - Prune unprofitable SCoPs", false, false) +INITIALIZE_PASS_END(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", + "Polly - Prune unprofitable SCoPs", false, false) + llvm::PreservedAnalyses PruneUnprofitablePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U) { diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 60a1e00916750..67a4c43455809 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -56,7 +56,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <deque> using namespace llvm; using namespace polly; 
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 29e89348125f2..43ed8636b054b 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -44,6 +44,7 @@ //===----------------------------------------------------------------------===// #include "polly/ScopDetection.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetectionDiagnostic.h" #include "polly/Support/SCEVValidator.h" @@ -74,6 +75,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" @@ -1980,12 +1983,53 @@ void ScopDetection::verifyAnalysis() { verifyRegion(*R); } +bool ScopDetectionWrapperPass::runOnFunction(Function &F) { + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + Result = std::make_unique<ScopDetection>(DT, SE, LI, RI, AA, ORE); + Result->detect(F); + return false; +} + +void ScopDetectionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + // We also need AA and RegionInfo when we are verifying analysis. + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<RegionInfoPass>(); + AU.setPreservesAll(); +} + +void ScopDetectionWrapperPass::print(raw_ostream &OS, const Module *) const { + for (const Region *R : Result->ValidRegions) + OS << "Valid Region for Scop: " << R->getNameStr() << '\n'; + + OS << "\n"; +} + +ScopDetectionWrapperPass::ScopDetectionWrapperPass() : FunctionPass(ID) { + // Disable runtime alias checks if we ignore aliasing all together. + if (IgnoreAliasing) + PollyUseRuntimeAliasChecks = false; +} + ScopAnalysis::ScopAnalysis() { // Disable runtime alias checks if we ignore aliasing all together. 
if (IgnoreAliasing) PollyUseRuntimeAliasChecks = false; } +void ScopDetectionWrapperPass::releaseMemory() { Result.reset(); } + +char ScopDetectionWrapperPass::ID; + AnalysisKey ScopAnalysis::Key; ScopDetection ScopAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { @@ -2011,3 +2055,66 @@ PreservedAnalyses ScopAnalysisPrinterPass::run(Function &F, OS << "\n"; return PreservedAnalyses::all(); } + +Pass *polly::createScopDetectionWrapperPassPass() { + return new ScopDetectionWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(ScopDetectionWrapperPass, "polly-detect", + "Polly - Detect static control parts (SCoPs)", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(ScopDetectionWrapperPass, "polly-detect", + "Polly - Detect static control parts (SCoPs)", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from ScopDetectionWrapperPass. +class ScopDetectionPrinterLegacyPass final : public FunctionPass { +public: + static char ID; + + ScopDetectionPrinterLegacyPass() : ScopDetectionPrinterLegacyPass(outs()) {} + + explicit ScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + ScopDetectionWrapperPass &P = getAnalysis<ScopDetectionWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScopDetectionWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopDetectionPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createScopDetectionPrinterLegacyPass(raw_ostream &OS) { + return new ScopDetectionPrinterLegacyPass(OS); +} + +INITIALIZE_PASS_BEGIN(ScopDetectionPrinterLegacyPass, "polly-print-detect", + "Polly - Print static control parts (SCoPs)", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END(ScopDetectionPrinterLegacyPass, "polly-print-detect", + "Polly - Print static control parts (SCoPs)", false, false) diff --git a/polly/lib/Analysis/ScopGraphPrinter.cpp b/polly/lib/Analysis/ScopGraphPrinter.cpp index 29e212882cefe..eb6c995f0bb91 100644 --- a/polly/lib/Analysis/ScopGraphPrinter.cpp +++ b/polly/lib/Analysis/ScopGraphPrinter.cpp @@ -14,26 +14,20 @@ //===----------------------------------------------------------------------===// #include "polly/ScopGraphPrinter.h" +#include "polly/LinkAllPasses.h" #include "polly/ScopDetection.h" #include "llvm/Support/CommandLine.h" using namespace polly; using namespace llvm; +static cl::opt<std::string> + ViewFilter("polly-view-only", + cl::desc("Only view functions that match this pattern"), + cl::Hidden, cl::init("")); -namespace polly { -std::string ViewFilter; -bool ViewAll; -} // namespace polly - -static cl::opt<std::string, true> - XViewFilter("polly-view-only", - cl::desc("Only view functions that match this pattern"), - cl::location(ViewFilter), cl::Hidden, cl::init("")); - -static cl::opt<bool, true> - XViewAll("polly-view-all", - cl::desc("Also show functions without any scops"), - 
cl::location(ViewAll), cl::Hidden, cl::init(false)); +static cl::opt<bool> ViewAll("polly-view-all", + cl::desc("Also show functions without any scops"), + cl::Hidden, cl::init(false)); namespace llvm { @@ -140,6 +134,104 @@ void DOTGraphTraits<ScopDetection *>::addCustomGraphFeatures( } // namespace llvm +struct ScopDetectionAnalysisGraphTraits { + static ScopDetection *getGraph(ScopDetectionWrapperPass *Analysis) { + return &Analysis->getSD(); + } +}; + +struct ScopViewerWrapperPass + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scops", ID) {} + bool processFunction(Function &F, ScopDetectionWrapperPass &SD) override { + if (ViewFilter != "" && !F.getName().count(ViewFilter)) + return false; + + if (ViewAll) + return true; + + // Check that at least one scop was detected. + return std::distance(SD.getSD().begin(), SD.getSD().end()) > 0; + } +}; +char ScopViewerWrapperPass::ID = 0; + +struct ScopOnlyViewerWrapperPass + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopOnlyViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scopsonly", ID) {} +}; +char ScopOnlyViewerWrapperPass::ID = 0; + +struct ScopPrinterWrapperPass + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scops", ID) {} +}; +char ScopPrinterWrapperPass::ID = 0; + +struct ScopOnlyPrinterWrapperPass + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopOnlyPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scopsonly", ID) {} +}; +char ScopOnlyPrinterWrapperPass::ID = 0; + +static RegisterPass<ScopViewerWrapperPass> X("view-scops", + "Polly - View Scops of function"); + +static RegisterPass<ScopOnlyViewerWrapperPass> + Y("view-scops-only", + "Polly - View Scops of function (with no function bodies)"); + +static RegisterPass<ScopPrinterWrapperPass> + M("dot-scops", "Polly - Print Scops of function"); + +static RegisterPass<ScopOnlyPrinterWrapperPass> + N("dot-scops-only", + "Polly - Print Scops of function (with no function bodies)"); + +Pass *polly::createDOTViewerWrapperPass() { + return new ScopViewerWrapperPass(); +} + +Pass *polly::createDOTOnlyViewerWrapperPass() { + return new ScopOnlyViewerWrapperPass(); +} + +Pass *polly::createDOTPrinterWrapperPass() { + return new ScopPrinterWrapperPass(); +} + +Pass *polly::createDOTOnlyPrinterWrapperPass() { + return new ScopOnlyPrinterWrapperPass(); +} + bool ScopViewer::processFunction(Function &F, const ScopDetection &SD) { if (ViewFilter != "" && !F.getName().count(ViewFilter)) return false; diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 70e184d3f897f..8c6a2360a249b 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -17,6 +17,7 @@ 
//===----------------------------------------------------------------------===// #include "polly/ScopInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopBuilder.h" #include "polly/ScopDetection.h" @@ -56,6 +57,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -2542,6 +2544,19 @@ raw_ostream &polly::operator<<(raw_ostream &OS, const Scop &scop) { return OS; } +//===----------------------------------------------------------------------===// +void ScopInfoRegionPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<RegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequiredTransitive<ScopDetectionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.setPreservesAll(); +} + void updateLoopCountStatistic(ScopDetection::LoopStats Stats, Scop::ScopStatistics ScopStats) { assert(Stats.NumLoops == ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops); @@ -2577,6 +2592,112 @@ void updateLoopCountStatistic(ScopDetection::LoopStats Stats, NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; } +bool ScopInfoRegionPass::runOnRegion(Region *R, RGPassManager &RGM) { + auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); + + if (!SD.isMaxRegionInScop(*R)) + return false; + + Function *F = R->getEntry()->getParent(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto const &DL = F->getParent()->getDataLayout(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + ScopBuilder SB(R, AC, AA, DL, DT, LI, SD, SE, ORE); + S = SB.getScop(); // take ownership of scop object + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) + if (S) { + ScopDetection::LoopStats Stats = + ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); + updateLoopCountStatistic(Stats, S->getStatistics()); + } +#endif + + return false; +} + +void ScopInfoRegionPass::print(raw_ostream &OS, const Module *) const { + if (S) + S->print(OS, PollyPrintInstructions); + else + OS << "Invalid Scop!\n"; +} + +char ScopInfoRegionPass::ID = 0; + +Pass *polly::createScopInfoRegionPassPass() { return new ScopInfoRegionPass(); } + +INITIALIZE_PASS_BEGIN(ScopInfoRegionPass, "polly-scops", + "Polly - Create polyhedral description of Scops", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END(ScopInfoRegionPass, "polly-scops", + "Polly - Create polyhedral description of Scops", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { + +/// Print result from 
ScopInfoRegionPass. +class ScopInfoPrinterLegacyRegionPass final : public RegionPass { +public: + static char ID; + + ScopInfoPrinterLegacyRegionPass() : ScopInfoPrinterLegacyRegionPass(outs()) {} + + explicit ScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS) + : RegionPass(ID), OS(OS) {} + + bool runOnRegion(Region *R, RGPassManager &RGM) override { + ScopInfoRegionPass &P = getAnalysis<ScopInfoRegionPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << R->getNameStr() << "' in function '" + << R->getEntry()->getParent()->getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + RegionPass::getAnalysisUsage(AU); + AU.addRequired<ScopInfoRegionPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopInfoPrinterLegacyRegionPass::ID = 0; +} // namespace + +Pass *polly::createScopInfoPrinterLegacyRegionPass(raw_ostream &OS) { + return new ScopInfoPrinterLegacyRegionPass(OS); +} + +INITIALIZE_PASS_BEGIN(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", + "Polly - Print polyhedral description of Scops", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_END(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", + "Polly - Print polyhedral description of Scops", false, + false) + +//===----------------------------------------------------------------------===// + ScopInfo::ScopInfo(const DataLayout &DL, ScopDetection &SD, ScalarEvolution &SE, LoopInfo &LI, AliasAnalysis &AA, DominatorTree &DT, AssumptionCache &AC, OptimizationRemarkEmitter &ORE) @@ -2650,3 +2771,110 @@ PreservedAnalyses ScopInfoPrinterPass::run(Function &F, } return PreservedAnalyses::all(); } + +void ScopInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<RegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequiredTransitive<ScopDetectionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.setPreservesAll(); +} + +bool ScopInfoWrapperPass::runOnFunction(Function &F) { + auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto const &DL = F.getParent()->getDataLayout(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + Result.reset(new ScopInfo{DL, SD, SE, LI, AA, DT, AC, ORE}); + return false; +} + +void ScopInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + for (auto &It : *Result) { + if (It.second) + It.second->print(OS, PollyPrintInstructions); + else + OS << "Invalid Scop!\n"; + } +} + +char ScopInfoWrapperPass::ID = 0; + +Pass *polly::createScopInfoWrapperPassPass() { + return new ScopInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN( + ScopInfoWrapperPass, "polly-function-scops", + "Polly - Create polyhedral description of all Scops of a function", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END( + ScopInfoWrapperPass, "polly-function-scops", + "Polly - Create polyhedral description of all Scops of a function", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from ScopInfoWrapperPass. +class ScopInfoPrinterLegacyFunctionPass final : public FunctionPass { +public: + static char ID; + + ScopInfoPrinterLegacyFunctionPass() + : ScopInfoPrinterLegacyFunctionPass(outs()) {} + explicit ScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + ScopInfoWrapperPass &P = getAnalysis<ScopInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScopInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopInfoPrinterLegacyFunctionPass::ID = 0; +} // namespace + +Pass *polly::createScopInfoPrinterLegacyFunctionPass(raw_ostream &OS) { + return new ScopInfoPrinterLegacyFunctionPass(OS); +} + +INITIALIZE_PASS_BEGIN( + ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", + "Polly - Print polyhedral description of all Scops of a function", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); +INITIALIZE_PASS_END( + ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", + "Polly - Print polyhedral description of all Scops of a function", false, + false) diff --git a/polly/lib/Analysis/ScopPass.cpp b/polly/lib/Analysis/ScopPass.cpp index 61417e799cfa5..719cd0f6984e0 100644 --- a/polly/lib/Analysis/ScopPass.cpp +++ b/polly/lib/Analysis/ScopPass.cpp @@ -24,6 +24,42 @@ using namespace llvm; using namespace polly; +bool ScopPass::runOnRegion(Region *R, RGPassManager &RGM) { + S = nullptr; + + if (skipRegion(*R)) + return false; + + if ((S = getAnalysis<ScopInfoRegionPass>().getScop())) + return runOnScop(*S); + + return false; +} + +void ScopPass::print(raw_ostream &OS, const Module *M) const { + if (S) + printScop(OS, *S); +} + +void ScopPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<ScopInfoRegionPass>(); + + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScopDetectionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); + AU.addPreserved<LazyBlockFrequencyInfoPass>(); + AU.addPreserved<LazyBranchProbabilityInfoPass>(); + AU.addPreserved<RegionInfoPass>(); + AU.addPreserved<ScopInfoRegionPass>(); + AU.addPreserved<TargetTransformInfoWrapperPass>(); +} + namespace polly { template class OwningInnerAnalysisManagerProxy<ScopAnalysisManager, Function>; } diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index e4f196f151c9e..0ed673815ff34 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -60,9 +60,6 @@ add_llvm_pass_plugin(Polly CodeGen/RuntimeDebugBuilder.cpp 
   CodeGen/PerfMonitor.cpp
   Exchange/JSONExporter.cpp
-  Pass/PhaseManager.cpp
-  Pass/PollyFunctionPass.cpp
-  Pass/PollyModulePass.cpp
   Support/GICHelper.cpp
   Support/PollyDebug.cpp
   Support/SCEVAffinator.cpp
diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp
index f2d5a3422849e..2d8b393cc039c 100644
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@@ -25,6 +25,7 @@
 #include "polly/CodeGen/PerfMonitor.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/DependenceInfo.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/Support/ScopHelper.h"
@@ -36,6 +37,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -312,6 +314,59 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI,
   return true;
 }
 
+namespace {
+
+class CodeGeneration final : public ScopPass {
+public:
+  static char ID;
+
+  /// The data layout used.
+  const DataLayout *DL;
+
+  /// @name The analysis passes we need to generate code.
+  ///
+  ///{
+  LoopInfo *LI;
+  IslAstInfo *AI;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  RegionInfo *RI;
+  ///}
+
+  CodeGeneration() : ScopPass(ID) {}
+
+  /// Generate LLVM-IR for the SCoP @p S.
+  bool runOnScop(Scop &S) override {
+    AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    DL = &S.getFunction().getDataLayout();
+    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
+    return generateCode(S, *AI, *LI, *DT, *SE, *RI);
+  }
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<IslAstInfoWrapperPass>();
+    AU.addRequired<RegionInfoPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<ScopDetectionWrapperPass>();
+    AU.addRequired<ScopInfoRegionPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+
+    AU.addPreserved<DependenceInfo>();
+    AU.addPreserved<IslAstInfoWrapperPass>();
+
+    // FIXME: We do not yet add regions for the newly generated code to the
+    // region tree.
+ } +}; +} // namespace + PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &AR, SPMUpdater &U) { @@ -324,6 +379,17 @@ PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -bool polly::runCodeGeneration(Scop &S, RegionInfo &RI, IslAstInfo &AI) { - return generateCode(S, AI, *S.getLI(), *S.getDT(), *S.getSE(), RI); -} +char CodeGeneration::ID = 1; + +Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } + +INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", + "Polly - Create LLVM-IR from SCoPs", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", + "Polly - Create LLVM-IR from SCoPs", false, false) diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp index 3177cda225f1d..09bacda196742 100644 --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -29,6 +29,7 @@ #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetection.h" #include "polly/ScopInfo.h" @@ -82,11 +83,6 @@ static cl::opt<bool> DetectParallel("polly-ast-detect-parallel", cl::desc("Detect parallelism"), cl::Hidden, cl::cat(PollyCategory)); -static cl::opt<bool> - PollyPrintAst("polly-print-ast", - cl::desc("Print the ISL abstract syntax tree"), - cl::cat(PollyCategory)); - STATISTIC(ScopsProcessed, "Number of SCoPs processed"); STATISTIC(ScopsBeneficial, "Number of beneficial SCoPs"); STATISTIC(BeneficialAffineLoops, "Number of beneficial affine loops"); @@ -780,19 +776,90 @@ PreservedAnalyses IslAstPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -std::unique_ptr<IslAstInfo> -polly::runIslAstGen(Scop &S, DependenceAnalysis::Result &DA) { - auto GetDeps = [&](Dependences::AnalysisLevel Lvl) -> const Dependences & { - return DA.getDependences(Lvl); +void IslAstInfoWrapperPass::releaseMemory() { Ast.reset(); } + +bool IslAstInfoWrapperPass::runOnScop(Scop &Scop) { + auto GetDeps = [this](Dependences::AnalysisLevel Lvl) -> const Dependences & { + return getAnalysis<DependenceInfo>().getDependences(Lvl); }; - std::unique_ptr<IslAstInfo> Result = runIslAst(S, GetDeps); - if (PollyPrintAst) { - outs() << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; - if (Result) - Result->print(llvm::outs()); + Ast = runIslAst(Scop, GetDeps); + + return false; +} + +void IslAstInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + // Get the Common analysis usage of ScopPasses. 
+ ScopPass::getAnalysisUsage(AU); + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<DependenceInfo>(); + + AU.addPreserved<DependenceInfo>(); +} + +void IslAstInfoWrapperPass::printScop(raw_ostream &OS, Scop &S) const { + OS << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" + << S.getName() << "' in function '" << S.getFunction().getName() << "':\n"; + if (Ast) + Ast->print(OS); +} + +char IslAstInfoWrapperPass::ID = 0; + +Pass *polly::createIslAstInfoWrapperPassPass() { + return new IslAstInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(IslAstInfoWrapperPass, "polly-ast", + "Polly - Generate an AST of the SCoP (isl)", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_END(IslAstInfoWrapperPass, "polly-ast", + "Polly - Generate an AST from the SCoP (isl)", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from IslAstInfoWrapperPass. +class IslAstInfoPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + IslAstInfoPrinterLegacyPass() : IslAstInfoPrinterLegacyPass(outs()) {} + explicit IslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + IslAstInfoWrapperPass &P = getAnalysis<IslAstInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; } - return Result; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<IslAstInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char IslAstInfoPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createIslAstInfoPrinterLegacyPass(raw_ostream &OS) { + return new IslAstInfoPrinterLegacyPass(OS); } + +INITIALIZE_PASS_BEGIN(IslAstInfoPrinterLegacyPass, "polly-print-ast", + "Polly - Print the AST from a SCoP (isl)", false, false); +INITIALIZE_PASS_DEPENDENCY(IslAstInfoWrapperPass); +INITIALIZE_PASS_END(IslAstInfoPrinterLegacyPass, "polly-print-ast", + "Polly - Print the AST from a SCoP (isl)", false, false) diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index 7d30c030aa6e1..dfd63146edb5e 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -12,6 +12,7 @@ #include "polly/JSONExporter.h" #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -35,11 +36,6 @@ using namespace polly; #define DEBUG_TYPE "polly-import-jscop" -static cl::opt<bool> - PollyPrintImportJscop("polly-print-import-jscop", - cl::desc("Polly - Print Scop import result"), - cl::cat(PollyCategory)); - STATISTIC(NewAccessMapFound, "Number of updated access functions"); namespace { @@ -54,6 +50,36 @@ static cl::opt<std::string> cl::desc("Postfix to append to the import .jsop files."), cl::Hidden, cl::value_desc("File postfix"), cl::ValueRequired, cl::init(""), cl::cat(PollyCategory)); + +class JSONExporter : public ScopPass { +public: + static char ID; + explicit JSONExporter() : ScopPass(ID) {} + + /// Export the SCoP @p S to a JSON file. + bool runOnScop(Scop &S) override; + + /// Print the SCoP @p S as it is exported. 
+  void printScop(raw_ostream &OS, Scop &S) const override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+class JSONImporter : public ScopPass {
+public:
+  static char ID;
+  std::vector<std::string> NewAccessStrings;
+  explicit JSONImporter() : ScopPass(ID) {}
+  /// Import new access functions for SCoP @p S from a JSON file.
+  bool runOnScop(Scop &S) override;
+
+  /// Print the SCoP @p S and the imported access functions.
+  void printScop(raw_ostream &OS, Scop &S) const override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
 } // namespace
 
 static std::string getFileName(Scop &S, StringRef Suffix = "") {
@@ -716,6 +742,21 @@ static bool importScop(Scop &S, const Dependences &D, const DataLayout &DL,
   return true;
 }
 
+char JSONExporter::ID = 0;
+void JSONExporter::printScop(raw_ostream &OS, Scop &S) const { OS << S; }
+
+bool JSONExporter::runOnScop(Scop &S) {
+  exportScop(S);
+  return false;
+}
+
+void JSONExporter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<ScopInfoRegionPass>();
+}
+
+Pass *polly::createJSONExporterPass() { return new JSONExporter(); }
+
 PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM,
                                       ScopStandardAnalysisResults &SAR,
                                       SPMUpdater &) {
@@ -723,6 +764,37 @@ PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM,
   return PreservedAnalyses::all();
 }
 
+char JSONImporter::ID = 0;
+
+void JSONImporter::printScop(raw_ostream &OS, Scop &S) const {
+  OS << S;
+  for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(),
+                                                E = NewAccessStrings.end();
+       I != E; I++)
+    OS << "New access function '" << *I << "' detected in JSCOP file\n";
+}
+
+bool JSONImporter::runOnScop(Scop &S) {
+  const Dependences &D =
+      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
+  const DataLayout &DL = S.getFunction().getParent()->getDataLayout();
+
+  if (!importScop(S, D, DL, &NewAccessStrings))
+    report_fatal_error("Tried to import a malformed jscop file.");
+
+  return false;
+}
+
+void JSONImporter::getAnalysisUsage(AnalysisUsage &AU) const {
+  ScopPass::getAnalysisUsage(AU);
+  AU.addRequired<DependenceInfo>();
+
+  // TODO: JSONImporter should throw away DependenceInfo.
+ AU.addPreserved<DependenceInfo>(); +} + +Pass *polly::createJSONImporterPass() { return new JSONImporter(); } + PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -742,24 +814,68 @@ PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, return PA; } -void polly::runImportJSON(Scop &S, DependenceAnalysis::Result &DA) { - const Dependences &D = DA.getDependences(Dependences::AL_Statement); - const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); - std::vector<std::string> NewAccessStrings; - if (!importScop(S, D, DL, &NewAccessStrings)) - report_fatal_error("Tried to import a malformed jscop file."); +INITIALIZE_PASS_BEGIN(JSONExporter, "polly-export-jscop", + "Polly - Export Scops as JSON" + " (Writes a .jscop file for each Scop)", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo) +INITIALIZE_PASS_END(JSONExporter, "polly-export-jscop", + "Polly - Export Scops as JSON" + " (Writes a .jscop file for each Scop)", + false, false) + +INITIALIZE_PASS_BEGIN(JSONImporter, "polly-import-jscop", + "Polly - Import Scops from JSON" + " (Reads a .jscop file for each Scop)", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo) +INITIALIZE_PASS_END(JSONImporter, "polly-import-jscop", + "Polly - Import Scops from JSON" + " (Reads a .jscop file for each Scop)", + false, false) + +//===----------------------------------------------------------------------===// - if (PollyPrintImportJscop) { - outs() - << "Printing analysis 'Polly - Print Scop import result' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - outs() << S; - for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), - E = NewAccessStrings.end(); - I != E; I++) - outs() << "New access function '" << *I << "' detected in JSCOP file\n"; +namespace { +/// Print result from JSONImporter. 
+class JSONImporterPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()) {} + explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + JSONImporter &P = getAnalysis<JSONImporter>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<JSONImporter>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char JSONImporterPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) { + return new JSONImporterPrinterLegacyPass(OS); } -void polly::runExportJSON(Scop &S) { exportScop(S); } +INITIALIZE_PASS_BEGIN(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", + "Polly - Print Scop import result", false, false) +INITIALIZE_PASS_DEPENDENCY(JSONImporter) +INITIALIZE_PASS_END(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", + "Polly - Print Scop import result", false, false) diff --git a/polly/lib/Pass/PhaseManager.cpp b/polly/lib/Pass/PhaseManager.cpp deleted file mode 100644 index fb76c811859b8..0000000000000 --- a/polly/lib/Pass/PhaseManager.cpp +++ /dev/null @@ -1,424 +0,0 @@ -//===------ PhaseManager.cpp ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PhaseManager.h" -#include "polly/CodeGen/CodeGeneration.h" -#include "polly/CodeGen/IslAst.h" -#include "polly/CodePreparation.h" -#include "polly/DeLICM.h" -#include "polly/DeadCodeElimination.h" -#include "polly/DependenceInfo.h" -#include "polly/FlattenSchedule.h" -#include "polly/ForwardOpTree.h" -#include "polly/JSONExporter.h" -#include "polly/MaximalStaticExpansion.h" -#include "polly/PruneUnprofitable.h" -#include "polly/ScheduleOptimizer.h" -#include "polly/ScopDetection.h" -#include "polly/ScopDetectionDiagnostic.h" -#include "polly/ScopGraphPrinter.h" -#include "polly/ScopInfo.h" -#include "polly/Simplify.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/IR/Module.h" - -#define DEBUG_TYPE "polly-pass" - -using namespace polly; -using namespace llvm; - -namespace { - -/// Recurse through all subregions and all regions and add them to RQ. -static void addRegionIntoQueue(Region &R, SmallVector<Region *> &RQ) { - RQ.push_back(&R); - for (const auto &E : R) - addRegionIntoQueue(*E, RQ); -} - -/// The phase pipeline of Polly to be embedded into another pass manager than -/// runs passes on functions. -/// -/// Polly holds state besides LLVM-IR (RegionInfo and ScopInfo) between phases -/// that LLVM pass managers do not consider when scheduling analyses and passes. -/// That is, the ScopInfo must persist between phases that a pass manager must -/// not invalidate to recompute later. 
-class PhaseManager { -private: - Function &F; - FunctionAnalysisManager &FAM; - PollyPassOptions Opts; - -public: - PhaseManager(Function &F, FunctionAnalysisManager &FAM, PollyPassOptions Opts) - : F(F), FAM(FAM), Opts(std::move(Opts)) {} - - /// Execute Polly's phases as indicated by the options. - bool run() { - // Get analyses from the function pass manager. - // These must be preserved during all phases so that if processing one SCoP - // has finished, the next SCoP can still use them. Recomputing is not an - // option because ScopDetection stores references to the old results. - // TODO: CodePreparation doesn't actually need these analysis, it just keeps - // them up-to-date. If they are not computed yet, can also compute after the - // prepare phase. - LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); - DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); - bool ModifiedIR = false; - - // Phase: prepare - // TODO: Setting ModifiedIR will invalidate any analysis, even if DT, LI are - // preserved. - if (Opts.isPhaseEnabled(PassPhase::Prepare)) { - PreservedAnalyses PA = CodePreparationPass().run(F, FAM); - FAM.invalidate(F, PA); - if (!PA.areAllPreserved()) - ModifiedIR = true; - } - - // Can't do anything without detection - if (!Opts.isPhaseEnabled(PassPhase::Detection)) - return false; - - AAResults &AA = FAM.getResult<AAManager>(F); - ScalarEvolution &SE = FAM.getResult<ScalarEvolutionAnalysis>(F); - OptimizationRemarkEmitter &ORE = - FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - - // ScopDetection is modifying RegionInfo, do not cache it, nor use a cached - // version. - RegionInfo RI = RegionInfoAnalysis().run(F, FAM); - - // Phase: detection - ScopDetection SD(DT, SE, LI, RI, AA, ORE); - SD.detect(F); - if (Opts.isPhaseEnabled(PassPhase::PrintDetect)) { - outs() << "Detected Scops in Function " << F.getName() << "\n"; - for (const Region *R : SD.ValidRegions) - outs() << "Valid Region for Scop: " << R->getNameStr() << '\n'; - outs() << "\n"; - } - - if (Opts.isPhaseEnabled(PassPhase::DotScops)) - printGraphForFunction(F, &SD, "scops", false); - if (Opts.isPhaseEnabled(PassPhase::DotScopsOnly)) - printGraphForFunction(F, &SD, "scopsonly", true); - - auto ViewScops = [&](const char *Name, bool IsSimply) { - if (Opts.ViewFilter.empty() && !F.getName().count(Opts.ViewFilter)) - return; - - if (Opts.ViewAll || std::distance(SD.begin(), SD.end()) > 0) - viewGraphForFunction(F, &SD, Name, IsSimply); - }; - if (Opts.isPhaseEnabled(PassPhase::ViewScops)) - ViewScops("scops", false); - if (Opts.isPhaseEnabled(PassPhase::ViewScopsOnly)) - ViewScops("scopsonly", true); - - // Phase: scops - AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F); - const DataLayout &DL = F.getParent()->getDataLayout(); - ScopInfo Info(DL, SD, SE, LI, AA, DT, AC, ORE); - if (Opts.isPhaseEnabled(PassPhase::PrintScopInfo)) { - if (Region *TLR = RI.getTopLevelRegion()) { - SmallVector<Region *> Regions; - addRegionIntoQueue(*TLR, Regions); - - // reverse iteration because the regression tests expect it. 
- for (Region *R : reverse(Regions)) { - Scop *S = Info.getScop(R); - outs() << "Printing analysis 'Polly - Create polyhedral " - "description of Scops' for region: '" - << R->getNameStr() << "' in function '" << F.getName() - << "':\n"; - if (S) - outs() << *S; - else - outs() << "Invalid Scop!\n"; - } - } - } - - SmallPriorityWorklist<Region *, 4> Worklist; - for (auto &[R, S] : Info) - if (S) - Worklist.insert(R); - - TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); - while (!Worklist.empty()) { - Region *R = Worklist.pop_back_val(); - if (!SD.isMaxRegionInScop(*R, /*Verify=*/false)) - continue; - Scop *S = Info.getScop(R); - - // Phase: flatten - if (Opts.isPhaseEnabled(PassPhase::Flatten)) - runFlattenSchedulePass(*S); - - // Phase: deps - // Actual analysis runs on-demand, so it does not matter whether the phase - // is actually enabled, but use this location to print dependencies. - DependenceAnalysis::Result DA = runDependenceAnalysis(*S); - if (Opts.isPhaseEnabled(PassPhase::PrintDependences)) { - assert(Opts.isPhaseEnabled(PassPhase::Dependences)); - const Dependences &D = DA.getDependences(Opts.PrintDepsAnalysisLevel); - D.print(outs()); - } - - // Phase: import-jscop - if (Opts.isPhaseEnabled(PassPhase::ImportJScop)) - runImportJSON(*S, DA); - - // Phase: simplify-0 - bool ModifiedSinceSimplify = true; - if (Opts.isPhaseEnabled(PassPhase::Simplify0)) { - runSimplify(*S, 0); - ModifiedSinceSimplify = false; - } - - // Phase: optree - if (Opts.isPhaseEnabled(PassPhase::Optree)) { - bool ModifiedByOptree = runForwardOpTree(*S); - ModifiedSinceSimplify |= ModifiedByOptree; - } - - // Phase: delicm - if (Opts.isPhaseEnabled(PassPhase::DeLICM)) { - bool ModifiedByDelicm = runDeLICM(*S); - ModifiedSinceSimplify |= ModifiedByDelicm; - } - - // Phase: simplify-1 - // If we have already run simplify-0, do not re-run it if the SCoP has not - // changed since then. - if (ModifiedSinceSimplify && Opts.isPhaseEnabled(PassPhase::Simplify1)) { - runSimplify(*S, 1); - ModifiedSinceSimplify = false; - } - - // Phase: dce - if (Opts.isPhaseEnabled(PassPhase::DeadCodeElimination)) - runDeadCodeElim(*S, DA); - - // Phase: mse - if (Opts.isPhaseEnabled(PassPhase::MaximumStaticExtension)) - runMaximalStaticExpansion(*S, DA); - - // Phase: prune - if (Opts.isPhaseEnabled(PassPhase::PruneUnprofitable)) - runPruneUnprofitable(*S); - - // Phase: opt-isl - if (Opts.isPhaseEnabled(PassPhase::Optimization)) - runIslScheduleOptimizer(*S, &TTI, DA); - - // Phase: import-jscop - if (Opts.isPhaseEnabled(PassPhase::ExportJScop)) - runExportJSON(*S); - - // Phase: ast - // Cannot run codegen unless ast is enabled - if (!Opts.isPhaseEnabled(PassPhase::AstGen)) - continue; - std::unique_ptr<IslAstInfo> IslAst = runIslAstGen(*S, DA); - - // Phase: codegen - if (!Opts.isPhaseEnabled(PassPhase::CodeGen)) - continue; - bool ModifiedByCodeGen = runCodeGeneration(*S, RI, *IslAst); - if (ModifiedByCodeGen) { - ModifiedIR = true; - - // For all regions, create new polly::Scop objects because the old ones - // refere to invalidated LLVM-IR. 
- // FIXME: Adds all SCoPs again to statistics - Info.recompute(); - } - } - - return ModifiedIR; - } -}; -} // namespace - -StringRef polly::getPhaseName(PassPhase Phase) { - switch (Phase) { - case PassPhase::Prepare: - return "prepare"; - case PassPhase::Detection: - return "detect"; - case PassPhase::PrintDetect: - return "print-detect"; - case PassPhase::DotScops: - return "dot-scops"; - case PassPhase::DotScopsOnly: - return "dot-scops-only"; - case PassPhase::ViewScops: - return "view-scops"; - case PassPhase::ViewScopsOnly: - return "view-scops-only"; - case PassPhase::ScopInfo: - return "scops"; - case PassPhase::PrintScopInfo: - return "print-scops"; - case PassPhase::Flatten: - return "flatten"; - case PassPhase::Dependences: - return "deps"; - case PassPhase::PrintDependences: - return "print-deps"; - case PassPhase::ImportJScop: - return "import-jscop"; - case PassPhase::Simplify0: - return "simplify-0"; - case PassPhase::Optree: - return "optree"; - case PassPhase::DeLICM: - return "delicm"; - case PassPhase::Simplify1: - return "simplify-1"; - case PassPhase::DeadCodeElimination: - return "dce"; - case PassPhase::MaximumStaticExtension: - return "mse"; - case PassPhase::PruneUnprofitable: - return "prune"; - case PassPhase::Optimization: - return "opt-isl"; // "opt" would conflict with the llvm executable - case PassPhase::ExportJScop: - return "export-jscop"; - case PassPhase::AstGen: - return "ast"; - case PassPhase::CodeGen: - return "codegen"; - default: - llvm_unreachable("Unexpected phase"); - } -} - -PassPhase polly::parsePhase(StringRef Name) { - return StringSwitch<PassPhase>(Name) - .Case("prepare", PassPhase::Prepare) - .Case("detect", PassPhase::Detection) - .Case("print-detect", PassPhase::PrintDetect) - .Case("dot-scops", PassPhase::DotScops) - .Case("dot-scops-only", PassPhase::DotScopsOnly) - .Case("view-scops", PassPhase::ViewScops) - .Case("view-scops-only", PassPhase::ViewScopsOnly) - .Case("scops", PassPhase::ScopInfo) - .Case("print-scops", PassPhase::PrintScopInfo) - .Case("flatten", PassPhase::Flatten) - .Case("deps", PassPhase::Dependences) - .Case("print-deps", PassPhase::PrintDependences) - .Case("import-jscop", PassPhase::ImportJScop) - .Case("simplify-0", PassPhase::Simplify0) - .Case("optree", PassPhase::Optree) - .Case("delicm", PassPhase::DeLICM) - .Case("simplify-1", PassPhase::Simplify1) - .Case("dce", PassPhase::DeadCodeElimination) - .Case("mse", PassPhase::MaximumStaticExtension) - .Case("prune", PassPhase::PruneUnprofitable) - .Case("opt-isl", PassPhase::Optimization) - .Case("export-jscop", PassPhase::ExportJScop) - .Case("ast", PassPhase::AstGen) - .Case("codegen", PassPhase::CodeGen) - .Default(PassPhase::None); -} - -bool polly::dependsOnDependenceInfo(PassPhase Phase) { - // Nothing before dep phase can depend on it - if (static_cast<size_t>(Phase) <= static_cast<size_t>(PassPhase::Dependences)) - return false; - - switch (Phase) { - case PassPhase::Simplify0: - case PassPhase::Optree: - case PassPhase::DeLICM: - case PassPhase::Simplify1: - case PassPhase::PruneUnprofitable: - case PassPhase::ImportJScop: - case PassPhase::ExportJScop: - case PassPhase::AstGen: // transitively through codegen - case PassPhase::CodeGen: - return false; - default: - return true; - } -} - -void PollyPassOptions::enableEnd2End() { - setPhaseEnabled(PassPhase::Detection); - setPhaseEnabled(PassPhase::ScopInfo); - setPhaseEnabled(PassPhase::Dependences); - setPhaseEnabled(PassPhase::AstGen); - setPhaseEnabled(PassPhase::CodeGen); -} - -void 
PollyPassOptions::enableDefaultOpts() { - setPhaseEnabled(PassPhase::Prepare); - setPhaseEnabled(PassPhase::Simplify0); - setPhaseEnabled(PassPhase::Optree); - setPhaseEnabled(PassPhase::DeLICM); - setPhaseEnabled(PassPhase::Simplify1); - setPhaseEnabled(PassPhase::PruneUnprofitable); - setPhaseEnabled(PassPhase::Optimization); -} - -void PollyPassOptions::disableAfter(PassPhase Phase) { - assert(Phase != PassPhase::None); - for (PassPhase P : enum_seq_inclusive(Phase, PassPhase::PassPhaseLast)) { - if (P == Phase) - continue; - setPhaseEnabled(P, false); - } -} - -Error PollyPassOptions::checkConsistency() const { - for (PassPhase P : enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - if (!isPhaseEnabled(P)) - continue; - - // Prepare and Detection have no requirements - if (P == PassPhase::Prepare || P == PassPhase::Detection) - continue; - - if (!isPhaseEnabled(PassPhase::Detection)) - return make_error<StringError>( - formatv("'{0}' requires 'detect' to be enabled", getPhaseName(P)) - .str(), - inconvertibleErrorCode()); - - if (static_cast<size_t>(P) < static_cast<size_t>(PassPhase::ScopInfo)) - continue; - - if (!isPhaseEnabled(PassPhase::ScopInfo)) - return make_error<StringError>( - formatv("'{0}' requires 'scops' to be enabled", getPhaseName(P)) - .str(), - inconvertibleErrorCode()); - - if (dependsOnDependenceInfo(P) && !isPhaseEnabled(PassPhase::Dependences)) - return make_error<StringError>( - formatv("'{0}' requires 'deps' to be enabled", getPhaseName(P)).str(), - inconvertibleErrorCode()); - } - - if (isPhaseEnabled(PassPhase::CodeGen) && !isPhaseEnabled(PassPhase::AstGen)) - return make_error<StringError>("'codegen' requires 'ast' to be enabled", - inconvertibleErrorCode()); - - return Error::success(); -} - -bool polly::runPollyPass(Function &F, FunctionAnalysisManager &FAM, - PollyPassOptions Opts) { - return PhaseManager(F, FAM, std::move(Opts)).run(); -} diff --git a/polly/lib/Pass/PollyFunctionPass.cpp b/polly/lib/Pass/PollyFunctionPass.cpp deleted file mode 100644 index a478e4df2ca20..0000000000000 --- a/polly/lib/Pass/PollyFunctionPass.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===------ PollyFunctionPass.cpp - Polly function pass ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PollyFunctionPass.h" - -using namespace llvm; -using namespace polly; - -PreservedAnalyses PollyFunctionPass::run(llvm::Function &F, - llvm::FunctionAnalysisManager &FAM) { - bool ModifiedIR = runPollyPass(F, FAM, Opts); - - // Be conservative about preserved analyses. - // FIXME: May also need to invalidate/update Module/CGSCC passes, but cannot - // reach them within a FunctionPassManager. - return ModifiedIR ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} diff --git a/polly/lib/Pass/PollyModulePass.cpp b/polly/lib/Pass/PollyModulePass.cpp deleted file mode 100644 index f56ee672b76af..0000000000000 --- a/polly/lib/Pass/PollyModulePass.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===------ PollyModulePass.cpp - Polly module pass ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PollyModulePass.h" -#include "llvm/IR/Module.h" - -using namespace llvm; -using namespace polly; - -PreservedAnalyses PollyModulePass::run(llvm::Module &M, - llvm::ModuleAnalysisManager &MAM) { - FunctionAnalysisManager &FAM = - MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - - bool ModifiedAnyIR = false; - for (Function &F : M) { - bool LocalModifiedIR = runPollyPass(F, FAM, Opts); - ModifiedAnyIR |= LocalModifiedIR; - } - - // Be conservative about preserved analyses, especially if parallel functions - // have been outlined. - return ModifiedAnyIR ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} diff --git a/polly/lib/Support/DumpFunctionPass.cpp b/polly/lib/Support/DumpFunctionPass.cpp index 9565e2156aee6..e47b7fe0db966 100644 --- a/polly/lib/Support/DumpFunctionPass.cpp +++ b/polly/lib/Support/DumpFunctionPass.cpp @@ -13,6 +13,7 @@ #include "polly/Support/DumpFunctionPass.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassInstrumentation.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -81,10 +82,50 @@ static void runDumpFunction(llvm::Function &F, StringRef Suffix) { Out->keep(); LLVM_DEBUG(dbgs() << "Dump file " << Dumpfile << " written successfully\n"); } + +class DumpFunctionWrapperPass final : public FunctionPass { +private: + DumpFunctionWrapperPass(const DumpFunctionWrapperPass &) = delete; + const DumpFunctionWrapperPass & + operator=(const DumpFunctionWrapperPass &) = delete; + + std::string Suffix; + +public: + static char ID; + + explicit DumpFunctionWrapperPass() : FunctionPass(ID), Suffix("-dump") {} + + explicit DumpFunctionWrapperPass(std::string Suffix) + : FunctionPass(ID), Suffix(std::move(Suffix)) {} + + /// @name FunctionPass interface + //@{ + void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnFunction(llvm::Function &F) override { + runDumpFunction(F, Suffix); + return false; + } + //@} +}; + +char DumpFunctionWrapperPass::ID; } // namespace +FunctionPass *polly::createDumpFunctionWrapperPass(std::string Suffix) { + return new DumpFunctionWrapperPass(std::move(Suffix)); +} + llvm::PreservedAnalyses DumpFunctionPass::run(Function &F, FunctionAnalysisManager &AM) { runDumpFunction(F, Suffix); return PreservedAnalyses::all(); } + +INITIALIZE_PASS_BEGIN(DumpFunctionWrapperPass, "polly-dump-function", + "Polly - Dump Function", false, false) +INITIALIZE_PASS_END(DumpFunctionWrapperPass, "polly-dump-function", + "Polly - Dump Function", false, false) diff --git a/polly/lib/Support/DumpModulePass.cpp b/polly/lib/Support/DumpModulePass.cpp index 2eaa0707fe571..c1c27ef6ac757 100644 --- a/polly/lib/Support/DumpModulePass.cpp +++ b/polly/lib/Support/DumpModulePass.cpp @@ -12,6 +12,7 @@ #include "polly/Support/DumpModulePass.h" #include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -46,10 +47,56 @@ static void runDumpModule(llvm::Module &M, StringRef Filename, bool IsSuffix) { M.print(Out->os(), nullptr); Out->keep(); } + +class DumpModuleWrapperPass final : public ModulePass { +private: + DumpModuleWrapperPass(const DumpModuleWrapperPass &) = delete; + const DumpModuleWrapperPass & + operator=(const DumpModuleWrapperPass &) = delete; + + 
std::string Filename; + bool IsSuffix; + +public: + static char ID; + + /// This constructor is used e.g. if using opt -polly-dump-module. + /// + /// Provide a default suffix to not overwrite the original file. + explicit DumpModuleWrapperPass() + : ModulePass(ID), Filename("-dump"), IsSuffix(true) {} + + explicit DumpModuleWrapperPass(std::string Filename, bool IsSuffix) + : ModulePass(ID), Filename(std::move(Filename)), IsSuffix(IsSuffix) {} + + /// @name ModulePass interface + //@{ + void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnModule(llvm::Module &M) override { + runDumpModule(M, Filename, IsSuffix); + return false; + } + //@} +}; + +char DumpModuleWrapperPass::ID; } // namespace +ModulePass *polly::createDumpModuleWrapperPass(std::string Filename, + bool IsSuffix) { + return new DumpModuleWrapperPass(std::move(Filename), IsSuffix); +} + llvm::PreservedAnalyses DumpModulePass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) { runDumpModule(M, Filename, IsSuffix); return PreservedAnalyses::all(); } + +INITIALIZE_PASS_BEGIN(DumpModuleWrapperPass, "polly-dump-module", + "Polly - Dump Module", false, false) +INITIALIZE_PASS_END(DumpModuleWrapperPass, "polly-dump-module", + "Polly - Dump Module", false, false) diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def index 496839760a844..2c792a5867100 100644 --- a/polly/lib/Support/PollyPasses.def +++ b/polly/lib/Support/PollyPasses.def @@ -1,10 +1,3 @@ -#ifndef MODULE_PASS -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) -#endif -MODULE_PASS("polly", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyDefaultOptions) -MODULE_PASS("polly-custom", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyCustomOptions) -#undef MODULE_PASS - #ifndef CGSCC_PASS #define CGSCC_PASS(NAME, CREATE_PASS, PARSER) #endif @@ -19,17 +12,15 @@ FUNCTION_ANALYSIS("polly-function-scops", ScopInfoAnalysis()) #undef FUNCTION_ANALYSIS #ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) +#define FUNCTION_PASS(NAME, CREATE_PASS) #endif -FUNCTION_PASS("polly-prepare", CodePreparationPass(), parseNoOptions) -FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs()), parseNoOptions) -FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs()), parseNoOptions) -FUNCTION_PASS("polly-scop-viewer", ScopViewer(), parseNoOptions) -FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer(), parseNoOptions) -FUNCTION_PASS("polly-scop-printer", ScopPrinter(), parseNoOptions) -FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter(), parseNoOptions) -FUNCTION_PASS("polly", PollyFunctionPass(Opts), parsePollyDefaultOptions) -FUNCTION_PASS("polly-custom", PollyFunctionPass(Opts), parsePollyCustomOptions) +FUNCTION_PASS("polly-prepare", CodePreparationPass()) +FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs())) +FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs())) +FUNCTION_PASS("polly-scop-viewer", ScopViewer()) +FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer()) +FUNCTION_PASS("polly-scop-printer", ScopPrinter()) +FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter()) #undef FUNCTION_PASS #ifndef SCOP_ANALYSIS diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 2f1d7a8362349..04f8715502c38 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -28,9 +28,8 
@@ #include "polly/DependenceInfo.h" #include "polly/ForwardOpTree.h" #include "polly/JSONExporter.h" +#include "polly/LinkAllPasses.h" #include "polly/MaximalStaticExpansion.h" -#include "polly/Options.h" -#include "polly/Pass/PollyFunctionPass.h" #include "polly/PruneUnprofitable.h" #include "polly/ScheduleOptimizer.h" #include "polly/ScopDetection.h" @@ -53,8 +52,6 @@ #include "llvm/Transforms/IPO.h" using namespace llvm; -using namespace polly; - namespace cl = llvm::cl; using namespace polly; @@ -204,19 +201,58 @@ static cl::opt<bool> EnablePruneUnprofitable( cl::desc("Bail out on unprofitable SCoPs before rescheduling"), cl::Hidden, cl::init(true), cl::cat(PollyCategory)); -static cl::opt<bool> - PollyPrintDetect("polly-print-detect", - cl::desc("Polly - Print static control parts (SCoPs)"), - cl::cat(PollyCategory)); - -static cl::opt<bool> - PollyPrintScops("polly-print-scops", - cl::desc("Print polyhedral description of all regions"), - cl::cat(PollyCategory)); +namespace { -static cl::opt<bool> PollyPrintDeps("polly-print-deps", - cl::desc("Polly - Print dependences"), - cl::cat(PollyCategory)); +/// Initialize Polly passes when library is loaded. +/// +/// We use the constructor of a statically declared object to initialize the +/// different Polly passes right after the Polly library is loaded. This ensures +/// that the Polly passes are available e.g. in the 'opt' tool. +struct StaticInitializer { + StaticInitializer() { + llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); + polly::initializePollyPasses(Registry); + } +}; +static StaticInitializer InitializeEverything; +} // end of anonymous namespace. + +void initializePollyPasses(llvm::PassRegistry &Registry) { + initializeCodeGenerationPass(Registry); + + initializeCodePreparationPass(Registry); + initializeDeadCodeElimWrapperPassPass(Registry); + initializeDependenceInfoPass(Registry); + initializeDependenceInfoPrinterLegacyPassPass(Registry); + initializeDependenceInfoWrapperPassPass(Registry); + initializeDependenceInfoPrinterLegacyFunctionPassPass(Registry); + initializeJSONExporterPass(Registry); + initializeJSONImporterPass(Registry); + initializeJSONImporterPrinterLegacyPassPass(Registry); + initializeMaximalStaticExpanderWrapperPassPass(Registry); + initializeIslAstInfoWrapperPassPass(Registry); + initializeIslAstInfoPrinterLegacyPassPass(Registry); + initializeIslScheduleOptimizerWrapperPassPass(Registry); + initializeIslScheduleOptimizerPrinterLegacyPassPass(Registry); + initializePollyCanonicalizePass(Registry); + initializeScopDetectionWrapperPassPass(Registry); + initializeScopDetectionPrinterLegacyPassPass(Registry); + initializeScopInlinerWrapperPassPass(Registry); + initializeScopInfoRegionPassPass(Registry); + initializeScopInfoPrinterLegacyRegionPassPass(Registry); + initializeScopInfoWrapperPassPass(Registry); + initializeScopInfoPrinterLegacyFunctionPassPass(Registry); + initializeFlattenSchedulePass(Registry); + initializeFlattenSchedulePrinterLegacyPassPass(Registry); + initializeForwardOpTreeWrapperPassPass(Registry); + initializeForwardOpTreePrinterLegacyPassPass(Registry); + initializeDeLICMWrapperPassPass(Registry); + initializeDeLICMPrinterLegacyPassPass(Registry); + initializeSimplifyWrapperPassPass(Registry); + initializeSimplifyPrinterLegacyPassPass(Registry); + initializeDumpModuleWrapperPassPass(Registry); + initializePruneUnprofitableWrapperPassPass(Registry); +} static bool shouldEnablePollyForOptimization() { return PollyEnabled; } @@ -230,198 +266,6 @@ static bool 
shouldEnablePollyForDiagnostic() { ExportJScop; } -/// Parser of parameters for LoopVectorize pass. -static llvm::Expected<PollyPassOptions> parsePollyOptions(StringRef Params, - bool IsCustom) { - PassPhase PrevPhase = PassPhase::None; - - bool EnableDefaultOpts = !IsCustom; - bool EnableEnd2End = !IsCustom; - std::optional<bool> - PassEnabled[static_cast<size_t>(PassPhase::PassPhaseLast) + 1]; - PassPhase StopAfter = PassPhase::None; - - // Passes enabled using command-line flags (can be overridden using - // 'polly<no-pass>') - if (PollyPrintDetect) - PassEnabled[static_cast<size_t>(PassPhase::PrintDetect)] = true; - if (PollyPrintScops) - PassEnabled[static_cast<size_t>(PassPhase::PrintScopInfo)] = true; - if (PollyPrintDeps) - PassEnabled[static_cast<size_t>(PassPhase::PrintDependences)] = true; - - if (PollyViewer) - PassEnabled[static_cast<size_t>(PassPhase::ViewScops)] = true; - if (PollyOnlyViewer) - PassEnabled[static_cast<size_t>(PassPhase::ViewScopsOnly)] = true; - if (PollyPrinter) - PassEnabled[static_cast<size_t>(PassPhase::DotScops)] = true; - if (PollyOnlyPrinter) - PassEnabled[static_cast<size_t>(PassPhase::DotScopsOnly)] = true; - if (!EnableSimplify) - PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = false; - if (!EnableForwardOpTree) - PassEnabled[static_cast<size_t>(PassPhase::Optree)] = false; - if (!EnableDeLICM) - PassEnabled[static_cast<size_t>(PassPhase::DeLICM)] = false; - if (!EnableSimplify) - PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = false; - if (ImportJScop) - PassEnabled[static_cast<size_t>(PassPhase::ImportJScop)] = true; - if (DeadCodeElim) - PassEnabled[static_cast<size_t>(PassPhase::DeadCodeElimination)] = true; - if (FullyIndexedStaticExpansion) - PassEnabled[static_cast<size_t>(PassPhase::MaximumStaticExtension)] = true; - if (!EnablePruneUnprofitable) - PassEnabled[static_cast<size_t>(PassPhase::PruneUnprofitable)] = false; - switch (Optimizer) { - case OPTIMIZER_NONE: - // explicitly switched off - PassEnabled[static_cast<size_t>(PassPhase::Optimization)] = false; - break; - case OPTIMIZER_ISL: - // default: enabled - break; - } - if (ExportJScop) - PassEnabled[static_cast<size_t>(PassPhase::ExportJScop)] = true; - switch (CodeGeneration) { - case CODEGEN_AST: - PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = true; - PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false; - break; - case CODEGEN_FULL: - // default: ast and codegen enabled - break; - case CODEGEN_NONE: - PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = false; - PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false; - break; - } - - while (!Params.empty()) { - StringRef Param; - std::tie(Param, Params) = Params.split(';'); - auto [ParamName, ParamVal] = Param.split('='); - - if (ParamName == "stopafter") { - StopAfter = parsePhase(ParamVal); - if (StopAfter == PassPhase::None) - return make_error<StringError>( - formatv("invalid stopafter parameter value '{0}'", ParamVal).str(), - inconvertibleErrorCode()); - continue; - } - - if (!ParamVal.empty()) - return make_error<StringError>( - formatv("parameter '{0}' does not take value", ParamName).str(), - inconvertibleErrorCode()); - - bool Enabled = true; - if (ParamName.starts_with("no-")) { - Enabled = false; - ParamName = ParamName.drop_front(3); - } - - if (ParamName == "default-opts") { - EnableDefaultOpts = Enabled; - continue; - } - - if (ParamName == "end2end") { - EnableEnd2End = Enabled; - continue; - } - - PassPhase Phase; - - // Shortcut for both simplifys at the same time - if 
(ParamName == "simplify") { - PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = Enabled; - PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = Enabled; - Phase = PassPhase::Simplify0; - } else { - Phase = parsePhase(ParamName); - if (Phase == PassPhase::None) - return make_error<StringError>( - formatv("invalid Polly parameter/phase name '{0}'", ParamName) - .str(), - inconvertibleErrorCode()); - - if (PrevPhase >= Phase) - return make_error<StringError>( - formatv("phases must not be repeated and enumerated in-order: " - "'{0}' listed before '{1}'", - getPhaseName(PrevPhase), getPhaseName(Phase)) - .str(), - inconvertibleErrorCode()); - - PassEnabled[static_cast<size_t>(Phase)] = Enabled; - } - PrevPhase = Phase; - } - - PollyPassOptions Opts; - Opts.ViewAll = ViewAll; - Opts.ViewFilter = ViewFilter; - Opts.PrintDepsAnalysisLevel = OptAnalysisLevel; - - // Implicitly enable dependent phases first. May be overriden explicitly - // on/off later. - for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - bool Enabled = PassEnabled[static_cast<size_t>(P)].value_or(false); - if (!Enabled) - continue; - - if (static_cast<size_t>(PassPhase::Detection) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::Detection); - - if (static_cast<size_t>(PassPhase::ScopInfo) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::ScopInfo); - - if (dependsOnDependenceInfo(P)) - Opts.setPhaseEnabled(PassPhase::Dependences); - - if (static_cast<size_t>(PassPhase::AstGen) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::AstGen); - } - - if (EnableEnd2End) - Opts.enableEnd2End(); - - if (EnableDefaultOpts) - Opts.enableDefaultOpts(); - - for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - std::optional<bool> Enabled = PassEnabled[static_cast<size_t>(P)]; - - // Apply only if set explicitly. - if (Enabled.has_value()) - Opts.setPhaseEnabled(P, *Enabled); - } - - if (StopAfter != PassPhase::None) - Opts.disableAfter(StopAfter); - - if (Error CheckResult = Opts.checkConsistency()) - return CheckResult; - - return Opts; -} - -static llvm::Expected<PollyPassOptions> -parsePollyDefaultOptions(StringRef Params) { - return parsePollyOptions(Params, false); -} - -static llvm::Expected<PollyPassOptions> -parsePollyCustomOptions(StringRef Params) { - return parsePollyOptions(Params, true); -} - /// Register Polly passes such that they form a polyhedral optimizer. /// /// The individual Polly passes are registered in the pass manager such that @@ -461,12 +305,77 @@ static void buildCommonPollyPipeline(FunctionPassManager &PM, OptimizationLevel Level, bool EnableForOpt) { PassBuilder PB; + ScopPassManager SPM; + + PM.addPass(CodePreparationPass()); + + // TODO add utility passes for the various command line options, once they're + // ported + + if (PollyDetectOnly) { + // Don't add more passes other than the ScopPassManager's detection passes. 
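+    // The adaptor below still runs the SCoP detection analyses on each
+    // function; since SPM is still empty at this point, nothing is
+    // transformed.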
+ PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); + return; + } - ExitOnError Err("Inconsistent Polly configuration: "); - PollyPassOptions &&Opts = - Err(parsePollyOptions(StringRef(), /*IsCustom=*/false)); - PM.addPass(PollyFunctionPass(Opts)); + if (PollyViewer) + PM.addPass(ScopViewer()); + if (PollyOnlyViewer) + PM.addPass(ScopOnlyViewer()); + if (PollyPrinter) + PM.addPass(ScopPrinter()); + if (PollyOnlyPrinter) + PM.addPass(ScopOnlyPrinter()); + if (EnableSimplify) + SPM.addPass(SimplifyPass(0)); + if (EnableForwardOpTree) + SPM.addPass(ForwardOpTreePass()); + if (EnableDeLICM) + SPM.addPass(DeLICMPass()); + if (EnableSimplify) + SPM.addPass(SimplifyPass(1)); + + if (ImportJScop) + SPM.addPass(JSONImportPass()); + + if (DeadCodeElim) + SPM.addPass(DeadCodeElimPass()); + + if (FullyIndexedStaticExpansion) + SPM.addPass(MaximalStaticExpansionPass()); + + if (EnablePruneUnprofitable) + SPM.addPass(PruneUnprofitablePass()); + + switch (Optimizer) { + case OPTIMIZER_NONE: + break; /* Do nothing */ + case OPTIMIZER_ISL: + SPM.addPass(IslScheduleOptimizerPass()); + break; + } + + if (ExportJScop) + SPM.addPass(JSONExportPass()); + + if (!EnableForOpt) + return; + + switch (CodeGeneration) { + case CODEGEN_AST: + SPM.addPass( + llvm::RequireAnalysisPass<IslAstAnalysis, Scop, ScopAnalysisManager, + ScopStandardAnalysisResults &, + SPMUpdater &>()); + break; + case CODEGEN_FULL: + SPM.addPass(CodeGenerationPass()); + break; + case CODEGEN_NONE: + break; + } + PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); PM.addPass(PB.buildFunctionSimplificationPipeline( Level, llvm::ThinOrFullLTOPhase::None)); // Cleanup @@ -583,9 +492,8 @@ parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM, return false; } -static llvm::Expected<bool> +static bool parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, - PassInstrumentationCallbacks *PIC, ArrayRef<PassBuilder::PipelineElement> Pipeline) { if (llvm::parseAnalysisUtilityPasses<OwningScopAnalysisManagerFunctionProxy>( "polly-scop-analyses", Name, FPM)) @@ -597,13 +505,8 @@ parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, FPM)) \ return true; -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ - if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ - auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ - if (!ExpectedOpts) \ - return ExpectedOpts.takeError(); \ - auto &&Opts = *ExpectedOpts; \ - (void)Opts; \ +#define FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) { \ FPM.addPass(CREATE_PASS); \ return true; \ } @@ -689,28 +592,6 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, return true; } -static llvm::Expected<bool> -parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, - PassInstrumentationCallbacks *PIC, - ArrayRef<PassBuilder::PipelineElement> Pipeline) { - assert(Pipeline.empty()); - -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ - if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ - auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ - if (!ExpectedOpts) \ - return ExpectedOpts.takeError(); \ - auto &&Opts = *ExpectedOpts; \ - (void)Opts; \ - MPM.addPass(CREATE_PASS); \ - return true; \ - } - -#include "PollyPasses.def" - - return false; -} - /// Register Polly to be available as an optimizer /// /// @@ -739,36 +620,10 @@ parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, /// handle LICMed code to make it useful. 
void registerPollyPasses(PassBuilder &PB) { PassInstrumentationCallbacks *PIC = PB.getPassInstrumentationCallbacks(); - -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#define CGSCC_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#include "PollyPasses.def" - PB.registerAnalysisRegistrationCallback([PIC](FunctionAnalysisManager &FAM) { registerFunctionAnalyses(FAM, PIC); }); - PB.registerPipelineParsingCallback( - [PIC](StringRef Name, FunctionPassManager &FPM, - ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { - ExitOnError Err("Unable to parse Polly module pass: "); - return Err(parseFunctionPipeline(Name, FPM, PIC, Pipeline)); - }); + PB.registerPipelineParsingCallback(parseFunctionPipeline); PB.registerPipelineParsingCallback( [PIC](StringRef Name, FunctionPassManager &FPM, ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { @@ -780,12 +635,6 @@ void registerPollyPasses(PassBuilder &PB) { ExitOnError Err("Unable to parse Polly call graph pass: "); return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline)); }); - PB.registerPipelineParsingCallback( - [PIC](StringRef Name, ModulePassManager &MPM, - ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { - ExitOnError Err("Unable to parse Polly module pass: "); - return Err(parseModulePipeline(Name, MPM, PIC, Pipeline)); - }); PB.registerParseTopLevelPipelineCallback( [PIC](llvm::ModulePassManager &MPM, ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp index cf0ec4432f747..a2328d1bbb3cf 100644 --- a/polly/lib/Support/ScopHelper.cpp +++ b/polly/lib/Support/ScopHelper.cpp @@ -206,6 +206,18 @@ void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, DominatorTree *DT, splitBlock(EntryBlock, I, DT, LI, RI); } +void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, Pass *P) { + auto *DTWP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = P->getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + RegionInfoPass *RIP = P->getAnalysisIfAvailable<RegionInfoPass>(); + RegionInfo *RI = RIP ? &RIP->getRegionInfo() : nullptr; + + // splitBlock updates DT, LI and RI. 
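+  // Any of DT, LI and RI may be null here: getAnalysisIfAvailable only
+  // returns analyses the legacy pass manager has actually computed.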
+  polly::splitEntryBlockForAlloca(EntryBlock, DT, LI, RI);
+}
+
 void polly::recordAssumption(polly::RecordedAssumptionsTy *RecordedAssumptions,
                              polly::AssumptionKind Kind, isl::set Set,
                              DebugLoc Loc, polly::AssumptionSign Sign,
diff --git a/polly/lib/Transform/Canonicalization.cpp b/polly/lib/Transform/Canonicalization.cpp
index cd7195f5374df..1be560e64af40 100644
--- a/polly/lib/Transform/Canonicalization.cpp
+++ b/polly/lib/Transform/Canonicalization.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "polly/Canonicalization.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -38,6 +39,24 @@ static cl::opt<bool>
                  cl::desc("Run an early inliner pass before Polly"), cl::Hidden,
                  cl::cat(PollyCategory));
 
+void polly::registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM) {
+  bool UseMemSSA = true;
+  PM.add(llvm::createPromoteMemoryToRegisterPass());
+  PM.add(llvm::createEarlyCSEPass(UseMemSSA));
+  PM.add(llvm::createInstructionCombiningPass());
+  PM.add(llvm::createCFGSimplificationPass());
+  PM.add(llvm::createTailCallEliminationPass());
+  PM.add(llvm::createCFGSimplificationPass());
+  PM.add(llvm::createReassociatePass());
+  if (PollyInliner) {
+    PM.add(llvm::createPromoteMemoryToRegisterPass());
+    PM.add(llvm::createCFGSimplificationPass());
+    PM.add(llvm::createInstructionCombiningPass());
+    PM.add(createBarrierNoopPass());
+  }
+  PM.add(llvm::createInstructionCombiningPass());
+}
+
 /// Adapted from llvm::PassBuilder::buildInlinerPipeline
 static ModuleInlinerWrapperPass
 buildInlinePasses(llvm::OptimizationLevel Level) {
@@ -106,3 +125,49 @@ polly::buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM,
 
   return FPM;
 }
+
+namespace {
+class PollyCanonicalize final : public ModulePass {
+  PollyCanonicalize(const PollyCanonicalize &) = delete;
+  const PollyCanonicalize &operator=(const PollyCanonicalize &) = delete;
+
+public:
+  static char ID;
+
+  explicit PollyCanonicalize() : ModulePass(ID) {}
+  ~PollyCanonicalize();
+
+  /// @name ModulePass interface.
+ //@{ + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnModule(Module &M) override; + void print(raw_ostream &OS, const Module *) const override; + //@} +}; +} // namespace + +PollyCanonicalize::~PollyCanonicalize() {} + +void PollyCanonicalize::getAnalysisUsage(AnalysisUsage &AU) const {} + +void PollyCanonicalize::releaseMemory() {} + +bool PollyCanonicalize::runOnModule(Module &M) { + legacy::PassManager PM; + registerCanonicalicationPasses(PM); + PM.run(M); + + return true; +} + +void PollyCanonicalize::print(raw_ostream &OS, const Module *) const {} + +char PollyCanonicalize::ID = 0; + +Pass *polly::createPollyCanonicalizePass() { return new PollyCanonicalize(); } + +INITIALIZE_PASS_BEGIN(PollyCanonicalize, "polly-canonicalize", + "Polly - Run canonicalization passes", false, false) +INITIALIZE_PASS_END(PollyCanonicalize, "polly-canonicalize", + "Polly - Run canonicalization passes", false, false) diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp index 5b96c865ad80f..d045fb6b62c90 100644 --- a/polly/lib/Transform/CodePreparation.cpp +++ b/polly/lib/Transform/CodePreparation.cpp @@ -16,11 +16,13 @@ //===----------------------------------------------------------------------===// #include "polly/CodePreparation.h" +#include "polly/LinkAllPasses.h" #include "polly/Support/ScopHelper.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/InitializePasses.h" using namespace llvm; using namespace polly; @@ -45,6 +47,32 @@ static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI, return true; } +namespace { + +/// Prepare the IR for the scop detection. +/// +class CodePreparation final : public FunctionPass { + CodePreparation(const CodePreparation &) = delete; + const CodePreparation &operator=(const CodePreparation &) = delete; + + void clear(); + +public: + static char ID; + + explicit CodePreparation() : FunctionPass(ID) {} + ~CodePreparation(); + + /// @name FunctionPass interface. 
+ //@{ + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnFunction(Function &F) override; + void print(raw_ostream &OS, const Module *) const override; + //@} +}; +} // namespace + PreservedAnalyses CodePreparationPass::run(Function &F, FunctionAnalysisManager &FAM) { auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); @@ -58,3 +86,44 @@ PreservedAnalyses CodePreparationPass::run(Function &F, PA.preserve<LoopAnalysis>(); return PA; } + +void CodePreparation::clear() {} + +CodePreparation::~CodePreparation() { clear(); } + +void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<RegionInfoPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<DominanceFrontierWrapperPass>(); +} + +bool CodePreparation::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + RegionInfo *RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); + + runCodePreprationImpl(F, DT, LI, RI); + + return true; +} + +void CodePreparation::releaseMemory() { clear(); } + +void CodePreparation::print(raw_ostream &OS, const Module *) const {} + +char CodePreparation::ID = 0; +char &polly::CodePreparationID = CodePreparation::ID; + +Pass *polly::createCodePreparationPass() { return new CodePreparation(); } + +INITIALIZE_PASS_BEGIN(CodePreparation, "polly-prepare", + "Polly - Prepare code for polly", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(CodePreparation, "polly-prepare", + "Polly - Prepare code for polly", false, false) diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index e8f2d951404f3..9a9768afe113e 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "polly/DeLICM.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -24,6 +25,7 @@ #include "polly/ZoneAlgo.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-delicm" @@ -33,10 +35,6 @@ using namespace llvm; namespace { -static cl::opt<bool> PollyPrintDeLICM("polly-print-delicm", - cl::desc("Polly - Print DeLICM/DePRE"), - cl::cat(PollyCategory)); - cl::opt<int> DelicmMaxOps("polly-delicm-max-ops", cl::desc("Maximum number of isl operations to invest for " @@ -1358,10 +1356,7 @@ class DeLICMImpl final : public ZoneAlgorithm { } /// Return whether at least one transformation been applied. 
- bool isModified() const { - return NumberOfTargetsMapped > 0 || NumberOfMappedValueScalars > 0 || - NumberOfMappedPHIScalars > 0; - } + bool isModified() const { return NumberOfTargetsMapped > 0; } }; static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { @@ -1381,7 +1376,7 @@ static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { return Impl; } -static std::unique_ptr<DeLICMImpl> runDeLICMImpl(Scop &S, LoopInfo &LI) { +static std::unique_ptr<DeLICMImpl> runDeLICM(Scop &S, LoopInfo &LI) { std::unique_ptr<DeLICMImpl> Impl = collapseToUnused(S, LI); Scop::ScopStatistics ScopStats = S.getStatistics(); @@ -1399,7 +1394,7 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); + std::unique_ptr<DeLICMImpl> Impl = runDeLICM(S, LI); if (OS) { *OS << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" @@ -1422,8 +1417,88 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } + +class DeLICMWrapperPass final : public ScopPass { +private: + DeLICMWrapperPass(const DeLICMWrapperPass &) = delete; + const DeLICMWrapperPass &operator=(const DeLICMWrapperPass &) = delete; + + /// The pass implementation, also holding per-scop data. + std::unique_ptr<DeLICMImpl> Impl; + +public: + static char ID; + explicit DeLICMWrapperPass() : ScopPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { + // Free resources for previous scop's computation, if not yet done. + releaseMemory(); + + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + Impl = runDeLICM(S, LI); + + return Impl->isModified(); + } + + void printScop(raw_ostream &OS, Scop &S) const override { + if (!Impl) + return; + assert(Impl->getScop() == &S); + + OS << "DeLICM result:\n"; + Impl->print(OS); + } + + void releaseMemory() override { Impl.reset(); } +}; + +char DeLICMWrapperPass::ID; + +/// Print result from DeLICMWrapperPass. 
+class DeLICMPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {}
+  explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    DeLICMWrapperPass &P = getAnalysis<DeLICMWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<DeLICMWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char DeLICMPrinterLegacyPass::ID = 0;
 } // anonymous namespace
 
+Pass *polly::createDeLICMWrapperPass() { return new DeLICMWrapperPass(); }
+
+llvm::Pass *polly::createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS) {
+  return new DeLICMPrinterLegacyPass(OS);
+}
+
 llvm::PreservedAnalyses polly::DeLICMPass::run(Scop &S,
                                                ScopAnalysisManager &SAM,
                                                ScopStandardAnalysisResults &SAR,
@@ -1452,21 +1527,15 @@ bool polly::isConflicting(
   return Knowledge::isConflicting(Existing, Proposed, OS, Indent);
 }
 
-bool polly::runDeLICM(Scop &S) {
-  LoopInfo &LI = *S.getLI();
-  std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI);
-
-  if (PollyPrintDeLICM) {
-    outs() << "Printing analysis 'Polly - DeLICM/DePRE' for region: '"
-           << S.getName() << "' in function '" << S.getFunction().getName()
-           << "':\n";
-    if (Impl) {
-      assert(Impl->getScop() == &S);
-
-      outs() << "DeLICM result:\n";
-      Impl->print(outs());
-    }
-  }
-
-  return Impl->isModified();
-}
+INITIALIZE_PASS_BEGIN(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE",
+                    false, false)
+
+INITIALIZE_PASS_BEGIN(DeLICMPrinterLegacyPass, "polly-print-delicm",
+                      "Polly - Print DeLICM/DePRE", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass)
+INITIALIZE_PASS_END(DeLICMPrinterLegacyPass, "polly-print-delicm",
+                    "Polly - Print DeLICM/DePRE", false, false)
diff --git a/polly/lib/Transform/DeadCodeElimination.cpp b/polly/lib/Transform/DeadCodeElimination.cpp
index df95e5190431c..5cb89fec09fe8 100644
--- a/polly/lib/Transform/DeadCodeElimination.cpp
+++ b/polly/lib/Transform/DeadCodeElimination.cpp
@@ -33,6 +33,7 @@
 
 #include "polly/DeadCodeElimination.h"
 #include "polly/DependenceInfo.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "llvm/Support/CommandLine.h"
@@ -50,6 +51,20 @@ cl::opt<int> DCEPreciseSteps(
             "before the actual dead code elimination."),
     cl::init(-1), cl::cat(PollyCategory));
 
+class DeadCodeElimWrapperPass final : public ScopPass {
+public:
+  static char ID;
+  explicit DeadCodeElimWrapperPass() : ScopPass(ID) {}
+
+  /// Remove dead iterations from the schedule of @p S.
+  bool runOnScop(Scop &S) override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+char DeadCodeElimWrapperPass::ID = 0;
+
 /// Return the set of live iterations.
 ///
 /// The set of live iterations are all iterations that write to memory and for
@@ -129,19 +144,29 @@ static bool runDeadCodeElimination(Scop &S, int PreciseSteps,
   return S.restrictDomains(Live);
 }
 
-} // namespace
-
-bool polly::runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA) {
-  const Dependences &Deps = DA.getDependences(Dependences::AL_Statement);
+bool DeadCodeElimWrapperPass::runOnScop(Scop &S) {
+  auto &DI = getAnalysis<DependenceInfo>();
+  const Dependences &Deps = DI.getDependences(Dependences::AL_Statement);
 
   bool Changed = runDeadCodeElimination(S, DCEPreciseSteps, Deps);
 
   // FIXME: We can probably avoid the recomputation of all dependences by
   // updating them explicitly.
   if (Changed)
-    DA.recomputeDependences(Dependences::AL_Statement);
+    DI.recomputeDependences(Dependences::AL_Statement);
 
-  return Changed;
+  return false;
+}
+
+void DeadCodeElimWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  ScopPass::getAnalysisUsage(AU);
+  AU.addRequired<DependenceInfo>();
+}
+
+} // namespace
+
+Pass *polly::createDeadCodeElimWrapperPass() {
+  return new DeadCodeElimWrapperPass();
 }
 
 llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
@@ -166,3 +191,10 @@ llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
   PA.preserveSet<AllAnalysesOn<Loop>>();
   return PA;
 }
+
+INITIALIZE_PASS_BEGIN(DeadCodeElimWrapperPass, "polly-dce",
+                      "Polly - Remove dead iterations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DependenceInfo)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass)
+INITIALIZE_PASS_END(DeadCodeElimWrapperPass, "polly-dce",
+                    "Polly - Remove dead iterations", false, false)
diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp
index 35a8ce6877036..f514ef359ba07 100644
--- a/polly/lib/Transform/FlattenSchedule.cpp
+++ b/polly/lib/Transform/FlattenSchedule.cpp
@@ -14,7 +14,6 @@
 
 #include "polly/FlattenSchedule.h"
 #include "polly/FlattenAlgo.h"
-#include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Support/ISLOStream.h"
@@ -27,10 +26,6 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool> PollyPrintFlattenSchedule("polly-print-flatten-schedule",
-                                               cl::desc("A polly pass"),
-                                               cl::cat(PollyCategory));
-
 /// Print a schedule to @p OS.
 ///
 /// Prints the schedule for each statements on a new line.
@@ -39,45 +34,119 @@ void printSchedule(raw_ostream &OS, const isl::union_map &Schedule,
   for (isl::map Map : Schedule.get_map_list())
     OS.indent(indent) << Map << "\n";
 }
-} // namespace
-void polly::runFlattenSchedulePass(Scop &S) {
-  // Keep a reference to isl_ctx to ensure that it is not freed before we free
-  // OldSchedule.
-  auto IslCtx = S.getSharedIslCtx();
+/// Flatten the schedule stored in a polly::Scop.
+class FlattenSchedule final : public ScopPass {
+private:
+  FlattenSchedule(const FlattenSchedule &) = delete;
+  const FlattenSchedule &operator=(const FlattenSchedule &) = delete;
+
+  std::shared_ptr<isl_ctx> IslCtx;
+  isl::union_map OldSchedule;
+
+public:
+  static char ID;
+  explicit FlattenSchedule() : ScopPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredTransitive<ScopInfoRegionPass>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnScop(Scop &S) override {
+    // Keep a reference to isl_ctx to ensure that it is not freed before we free
+    // OldSchedule.
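+    // (printScop still reads OldSchedule after this method returns; both
+    // members are released in releaseMemory().)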
+ IslCtx = S.getSharedIslCtx(); - POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); - auto OldSchedule = S.getSchedule(); - POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); + POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); + OldSchedule = S.getSchedule(); + POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); - auto Domains = S.getDomains(); - auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); - POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); - POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); + auto Domains = S.getDomains(); + auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); + POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); + POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); - auto NewSchedule = flattenSchedule(RestrictedOldSchedule); + auto NewSchedule = flattenSchedule(RestrictedOldSchedule); - POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - NewSchedule = NewSchedule.gist_domain(Domains); - POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + NewSchedule = NewSchedule.gist_domain(Domains); + POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - S.setSchedule(NewSchedule); + S.setSchedule(NewSchedule); + return false; + } - if (PollyPrintFlattenSchedule) { - outs() - << "Printing analysis 'Polly - Print flattened schedule' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; + void printScop(raw_ostream &OS, Scop &S) const override { + OS << "Schedule before flattening {\n"; + printSchedule(OS, OldSchedule, 4); + OS << "}\n\n"; - outs() << "Schedule before flattening {\n"; - printSchedule(outs(), OldSchedule, 4); - outs() << "}\n\n"; + OS << "Schedule after flattening {\n"; + printSchedule(OS, S.getSchedule(), 4); + OS << "}\n"; + } - outs() << "Schedule after flattening {\n"; - printSchedule(outs(), S.getSchedule(), 4); - outs() << "}\n"; + void releaseMemory() override { + OldSchedule = {}; + IslCtx.reset(); } +}; + +char FlattenSchedule::ID; + +/// Print result from FlattenSchedule. 
+class FlattenSchedulePrinterLegacyPass final : public ScopPass { +public: + static char ID; + + FlattenSchedulePrinterLegacyPass() + : FlattenSchedulePrinterLegacyPass(outs()) {} + explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + FlattenSchedule &P = getAnalysis<FlattenSchedule>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<FlattenSchedule>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char FlattenSchedulePrinterLegacyPass::ID = 0; +} // anonymous namespace + +Pass *polly::createFlattenSchedulePass() { return new FlattenSchedule(); } + +Pass *polly::createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) { + return new FlattenSchedulePrinterLegacyPass(OS); } + +INITIALIZE_PASS_BEGIN(FlattenSchedule, "polly-flatten-schedule", + "Polly - Flatten schedule", false, false) +INITIALIZE_PASS_END(FlattenSchedule, "polly-flatten-schedule", + "Polly - Flatten schedule", false, false) + +INITIALIZE_PASS_BEGIN(FlattenSchedulePrinterLegacyPass, + "polly-print-flatten-schedule", + "Polly - Print flattened schedule", false, false) +INITIALIZE_PASS_DEPENDENCY(FlattenSchedule) +INITIALIZE_PASS_END(FlattenSchedulePrinterLegacyPass, + "polly-print-flatten-schedule", + "Polly - Print flattened schedule", false, false) diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index 24d4a4af6e681..e9be6c9cdcc27 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -61,11 +62,6 @@ static cl::opt<unsigned> "analysis; 0=no limit"), cl::init(1000000), cl::cat(PollyCategory), cl::Hidden); -static cl::opt<bool> - PollyPrintOptree("polly-print-optree", - cl::desc("Polly - Print forward operand tree result"), - cl::cat(PollyCategory)); - STATISTIC(KnownAnalyzed, "Number of successfully analyzed SCoPs"); STATISTIC(KnownOutOfQuota, "Analyses aborted because max_operations was reached"); @@ -1034,8 +1030,8 @@ class ForwardOpTreeImpl final : ZoneAlgorithm { bool isModified() const { return Modified; } }; -static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTreeImpl(Scop &S, - LoopInfo &LI) { +static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTree(Scop &S, + LoopInfo &LI) { std::unique_ptr<ForwardOpTreeImpl> Impl; { IslMaxOperationsGuard MaxOpGuard(S.getIslCtx().get(), MaxOps, false); @@ -1077,7 +1073,7 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTree(S, LI); if (OS) { *OS << "Printing analysis 'Polly - Forward operand tree' for region: '" << S.getName() << "' in function '" << S.getFunction().getName() @@ -1098,8 +1094,99 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } + +/// Pass that redirects scalar reads to array elements that are known to 
contain
+/// the same value.
+///
+/// This reduces the number of scalar accesses and therefore potentially
+/// increases the freedom of the scheduler. In the ideal case, all reads of a
+/// scalar definition are redirected (we currently do not care about removing
+/// the write in this case). This is also useful for the main DeLICM pass as
+/// there are fewer scalars to be mapped.
+class ForwardOpTreeWrapperPass final : public ScopPass {
+private:
+  /// The pass implementation, also holding per-scop data.
+  std::unique_ptr<ForwardOpTreeImpl> Impl;
+
+public:
+  static char ID;
+
+  explicit ForwardOpTreeWrapperPass() : ScopPass(ID) {}
+  ForwardOpTreeWrapperPass(const ForwardOpTreeWrapperPass &) = delete;
+  ForwardOpTreeWrapperPass &
+  operator=(const ForwardOpTreeWrapperPass &) = delete;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredTransitive<ScopInfoRegionPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnScop(Scop &S) override {
+    // Free resources for previous SCoP's computation, if not yet done.
+    releaseMemory();
+
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+    Impl = runForwardOpTree(S, LI);
+
+    return false;
+  }
+
+  void printScop(raw_ostream &OS, Scop &S) const override {
+    if (!Impl)
+      return;
+
+    assert(Impl->getScop() == &S);
+    Impl->print(OS);
+  }
+
+  void releaseMemory() override { Impl.reset(); }
+}; // class ForwardOpTreeWrapperPass
+
+char ForwardOpTreeWrapperPass::ID;
+
+/// Print result from ForwardOpTreeWrapperPass.
+class ForwardOpTreePrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {}
+  explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    ForwardOpTreeWrapperPass &P = getAnalysis<ForwardOpTreeWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<ForwardOpTreeWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char ForwardOpTreePrinterLegacyPass::ID = 0;
 } // namespace
 
+Pass *polly::createForwardOpTreeWrapperPass() {
+  return new ForwardOpTreeWrapperPass();
+}
+
+Pass *polly::createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) {
+  return new ForwardOpTreePrinterLegacyPass(OS);
+}
+
 llvm::PreservedAnalyses
 ForwardOpTreePass::run(Scop &S, ScopAnalysisManager &SAM,
                        ScopStandardAnalysisResults &SAR,
@@ -1113,20 +1200,14 @@
 ForwardOpTreePrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
                               ScopStandardAnalysisResults &SAR, SPMUpdater &U,
   return runForwardOpTreeUsingNPM(S, SAM, SAR, U, &OS);
 }
 
-bool polly::runForwardOpTree(Scop &S) {
-  LoopInfo &LI = *S.getLI();
-
-  std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI);
-  if (PollyPrintOptree) {
-    outs() << "Printing analysis 'Polly - Forward operand tree' for region: '"
-           << S.getName() << "' in function '" << S.getFunction().getName()
-           << "':\n";
-    if (Impl) {
-      assert(Impl->getScop() == &S);
-
-      Impl->print(outs());
-    }
-  }
-
-  return Impl->isModified();
-}
+INITIALIZE_PASS_BEGIN(ForwardOpTreeWrapperPass, "polly-optree",
+                      "Polly - Forward operand tree", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(ForwardOpTreeWrapperPass,
"polly-optree", + "Polly - Forward operand tree", false, false) + +INITIALIZE_PASS_BEGIN(ForwardOpTreePrinterLegacyPass, "polly-print-optree", + "Polly - Print forward operand tree result", false, false) +INITIALIZE_PASS_DEPENDENCY(ForwardOpTreeWrapperPass) +INITIALIZE_PASS_END(ForwardOpTreePrinterLegacyPass, "polly-print-optree", + "Polly - Print forward operand tree result", false, false) diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index 62a4d251875c5..0719840f74a79 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -13,13 +13,14 @@ #include "polly/MaximalStaticExpansion.h" #include "polly/DependenceInfo.h" -#include "polly/Options.h" +#include "polly/LinkAllPasses.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/ISLTools.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "isl/isl-noexceptions.h" #include "isl/union_map.h" #include <cassert> @@ -34,10 +35,28 @@ using namespace polly; namespace { -static cl::opt<bool> - PollyPrintMSE("polly-print-mse", - cl::desc("Polly - Print Maximal static expansion of SCoP"), - cl::cat(PollyCategory)); +class MaximalStaticExpanderWrapperPass final : public ScopPass { +public: + static char ID; + + explicit MaximalStaticExpanderWrapperPass() : ScopPass(ID) {} + + ~MaximalStaticExpanderWrapperPass() override = default; + + /// Expand the accesses of the SCoP. + /// + /// @param S The SCoP that must be expanded. + bool runOnScop(Scop &S) override; + + /// Print the SCoP. + /// + /// @param OS The stream where to print. + /// @param S The SCop that must be printed. + void printScop(raw_ostream &OS, Scop &S) const override; + + /// Register all analyses and transformations required. + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; #ifndef NDEBUG /// Whether a dimension of a set is bounded (lower and upper) by a constant, @@ -439,8 +458,8 @@ class MaximalStaticExpansionImpl { }; static std::unique_ptr<MaximalStaticExpansionImpl> -runMaximalStaticExpansionImpl(Scop &S, OptimizationRemarkEmitter &ORE, - const Dependences &D) { +runMaximalStaticExpansion(Scop &S, OptimizationRemarkEmitter &ORE, + const Dependences &D) { auto Dependences = D.getDependences(Dependences::TYPE_RAW); std::unique_ptr<MaximalStaticExpansionImpl> Impl = @@ -459,7 +478,7 @@ static PreservedAnalyses runMSEUsingNPM(Scop &S, ScopAnalysisManager &SAM, auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansionImpl(S, ORE, D); + runMaximalStaticExpansion(S, ORE, D); if (OS) { *OS << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " @@ -492,24 +511,42 @@ MaximalStaticExpansionPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runMSEUsingNPM(S, SAM, SAR, &OS); } -void polly::runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI) { - OptimizationRemarkEmitter ORE(&S.getFunction()); +char MaximalStaticExpanderWrapperPass::ID = 0; + +bool MaximalStaticExpanderWrapperPass::runOnScop(Scop &S) { + // Get the ORE from OptimizationRemarkEmitterWrapperPass. + OptimizationRemarkEmitter *ORE = + &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + // Get the RAW Dependences. 
+ auto &DI = getAnalysis<DependenceInfo>(); auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansionImpl(S, ORE, D); + runMaximalStaticExpansion(S, *ORE, D); - if (PollyPrintMSE) { - outs() - << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " - "region: '" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; + return false; +} - if (Impl) { - outs() << "MSE result:\n"; - Impl->print(llvm::outs()); - } - } +void MaximalStaticExpanderWrapperPass::printScop(raw_ostream &OS, + Scop &S) const { + S.print(OS, false); } + +void MaximalStaticExpanderWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); +} + +Pass *polly::createMaximalStaticExpansionPass() { + return new MaximalStaticExpanderWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(MaximalStaticExpanderWrapperPass, "polly-mse", + "Polly - Maximal static expansion of SCoP", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(MaximalStaticExpanderWrapperPass, "polly-mse", + "Polly - Maximal static expansion of SCoP", false, false) diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 6acdd6862c4cc..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -57,6 +57,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "isl/options.h" @@ -197,10 +198,6 @@ static cl::opt<bool> OptimizedScops( "transformations is applied on the schedule tree"), cl::cat(PollyCategory)); -static cl::opt<bool> PollyPrintOptIsl("polly-print-opt-isl", - cl::desc("A polly pass"), - cl::cat(PollyCategory)); - STATISTIC(ScopsProcessed, "Number of scops processed"); STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); STATISTIC(ScopsOptimized, "Number of scops optimized"); @@ -641,6 +638,34 @@ bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, return changed; } +class IslScheduleOptimizerWrapperPass final : public ScopPass { +public: + static char ID; + + explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} + + /// Optimize the schedule of the SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Print the new schedule for the SCoP @p S. + void printScop(raw_ostream &OS, Scop &S) const override; + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Release the internal memory. 
+ void releaseMemory() override { + LastSchedule = {}; + IslCtx.reset(); + } + +private: + std::shared_ptr<isl_ctx> IslCtx; + isl::schedule LastSchedule; +}; + +char IslScheduleOptimizerWrapperPass::ID = 0; + #ifndef NDEBUG static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, StringRef Desc) { @@ -708,7 +733,7 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { &Version); } -static void runIslScheduleOptimizerImpl( +static void runIslScheduleOptimizer( Scop &S, function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, @@ -925,6 +950,30 @@ static void runIslScheduleOptimizerImpl( errs() << S; } +bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { + releaseMemory(); + + Function &F = S.getFunction(); + IslCtx = S.getSharedIslCtx(); + + auto getDependences = + [this](Dependences::AnalysisLevel) -> const Dependences & { + return getAnalysis<DependenceInfo>().getDependences( + Dependences::AL_Statement); + }; + OptimizationRemarkEmitter &ORE = + getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool DepsChanged = false; + runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, + DepsChanged); + if (DepsChanged) + getAnalysis<DependenceInfo>().abandonDependences(); + return false; +} + static void runScheduleOptimizerPrinter(raw_ostream &OS, isl::schedule LastSchedule) { isl_printer *p; @@ -948,8 +997,36 @@ static void runScheduleOptimizerPrinter(raw_ostream &OS, free(ScheduleStr); } +void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { + runScheduleOptimizerPrinter(OS, LastSchedule); +} + +void IslScheduleOptimizerWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + + AU.addPreserved<DependenceInfo>(); + AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); +} + } // namespace +Pass *polly::createIslScheduleOptimizerWrapperPass() { + return new IslScheduleOptimizerWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", + "Polly - Optimize schedule of SCoP", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", + "Polly - Optimize schedule of SCoP", false, false) + static llvm::PreservedAnalyses runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, @@ -962,7 +1039,7 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, TargetTransformInfo *TTI = &SAR.TTI; isl::schedule LastSchedule; bool DepsChanged = false; - runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); if (DepsChanged) Deps.abandonDependences(); @@ -988,23 +1065,52 @@ IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); } -void polly::runIslScheduleOptimizer(Scop &S, TargetTransformInfo *TTI, - DependenceAnalysis::Result &Deps) { 
-  auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
-    return Deps.getDependences(Dependences::AL_Statement);
-  };
-  OptimizationRemarkEmitter ORE(&S.getFunction());
-  isl::schedule LastSchedule;
-  bool DepsChanged = false;
-  runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
-  if (DepsChanged)
-    Deps.abandonDependences();
+//===----------------------------------------------------------------------===//
 
-  if (PollyPrintOptIsl) {
-    outs()
-        << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
-        << S.getName() << "' in function '" << S.getFunction().getName()
-        << "':\n";
-    runScheduleOptimizerPrinter(outs(), LastSchedule);
+namespace {
+/// Print result from IslScheduleOptimizerWrapperPass.
+class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  IslScheduleOptimizerPrinterLegacyPass()
+      : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
+  explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    IslScheduleOptimizerWrapperPass &P =
+        getAnalysis<IslScheduleOptimizerWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<IslScheduleOptimizerWrapperPass>();
+    AU.setPreservesAll();
   }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
+} // namespace
+
+Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
+  return new IslScheduleOptimizerPrinterLegacyPass(OS);
 }
+
+INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
+                      "polly-print-opt-isl",
+                      "Polly - Print optimizer schedule of SCoP", false, false);
+INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
+INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
+                    "polly-print-opt-isl",
+                    "Polly - Print optimizer schedule of SCoP", false, false)
diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp
index 8e7a0dedaf533..c04ba3498339e 100644
--- a/polly/lib/Transform/ScopInliner.cpp
+++ b/polly/lib/Transform/ScopInliner.cpp
@@ -95,7 +95,53 @@ template <typename SCC_t> bool runScopInlinerImpl(Function *F, SCC_t &SCC) {
 
   return Changed;
 }
+
+class ScopInlinerWrapperPass final : public CallGraphSCCPass {
+  using llvm::Pass::doInitialization;
+
+public:
+  static char ID;
+
+  ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {}
+
+  bool doInitialization(CallGraph &CG) override {
+    if (!polly::PollyAllowFullFunction) {
+      report_fatal_error(
+          "Aborting from ScopInliner because it only makes sense to run with "
+          "-polly-allow-full-function. "
+          "The heuristic for ScopInliner checks that the full function is a "
+          "Scop, which happens if and only if polly-allow-full-function is "
+          "enabled.
" + " If not, the entry block is not included in the Scop"); + } + return true; + } + + bool runOnSCC(CallGraphSCC &SCC) override { + Function *F = (*SCC.begin())->getFunction(); + return runScopInlinerImpl(F, SCC); + }; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + CallGraphSCCPass::getAnalysisUsage(AU); + } +}; } // namespace +char ScopInlinerWrapperPass::ID; + +Pass *polly::createScopInlinerWrapperPass() { + ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass(); + return pass; +} + +INITIALIZE_PASS_BEGIN( + ScopInlinerWrapperPass, "polly-scop-inliner", + "inline functions based on how much of the function is a scop.", false, + false) +INITIALIZE_PASS_END( + ScopInlinerWrapperPass, "polly-scop-inliner", + "inline functions based on how much of the function is a scop.", false, + false) polly::ScopInlinerPass::ScopInlinerPass() { if (!polly::PollyAllowFullFunction) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index cf0f8c5ca5ef2..75e91cd1c031a 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "polly/Simplify.h" -#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/GICHelper.h" @@ -19,6 +18,7 @@ #include "polly/Support/ISLTools.h" #include "polly/Support/VirtualInstruction.h" #include "llvm/ADT/Statistic.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include <optional> @@ -30,11 +30,6 @@ using namespace polly; namespace { -static cl::opt<bool> - PollyPrintSimplify("polly-print-simplify", - cl::desc("Polly - Print Simplify actions"), - cl::cat(PollyCategory)); - #define TWO_STATISTICS(VARNAME, DESC) \ static llvm::Statistic VARNAME[2] = { \ {DEBUG_TYPE, #VARNAME "0", DESC " (first)"}, \ @@ -761,6 +756,39 @@ void SimplifyImpl::printScop(raw_ostream &OS, Scop &S) const { printAccesses(OS); } +class SimplifyWrapperPass final : public ScopPass { +public: + static char ID; + int CallNo; + std::optional<SimplifyImpl> Impl; + + explicit SimplifyWrapperPass(int CallNo = 0) : ScopPass(ID), CallNo(CallNo) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + Impl.emplace(CallNo); + Impl->run(S, LI); + + return false; + } + + void printScop(raw_ostream &OS, Scop &S) const override { + if (Impl) + Impl->printScop(OS, S); + } + + void releaseMemory() override { Impl.reset(); } +}; + +char SimplifyWrapperPass::ID; + static llvm::PreservedAnalyses runSimplifyUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, int CallNo, @@ -815,15 +843,58 @@ SmallVector<MemoryAccess *, 32> polly::getAccessesInOrder(ScopStmt &Stmt) { return Accesses; } -bool polly::runSimplify(Scop &S, int CallNo) { - SimplifyImpl Impl(CallNo); - Impl.run(S, S.getLI()); - if (PollyPrintSimplify) { - outs() << "Printing analysis 'Polly - Simplify' for region: '" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; - Impl.printScop(outs(), S); +Pass *polly::createSimplifyWrapperPass(int CallNo) { + return new SimplifyWrapperPass(CallNo); +} + +INITIALIZE_PASS_BEGIN(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", + false, false) 
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify",
+                    false, false)
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Print result from SimplifyWrapperPass.
+class SimplifyPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  SimplifyPrinterLegacyPass() : SimplifyPrinterLegacyPass(outs()) {}
+  explicit SimplifyPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    SimplifyWrapperPass &P = getAnalysis<SimplifyWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<SimplifyWrapperPass>();
+    AU.setPreservesAll();
+  }
-  return Impl.isModified();
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char SimplifyPrinterLegacyPass::ID = 0;
+} // namespace
+
+Pass *polly::createSimplifyPrinterLegacyPass(raw_ostream &OS) {
+  return new SimplifyPrinterLegacyPass(OS);
+}
+
+INITIALIZE_PASS_BEGIN(SimplifyPrinterLegacyPass, "polly-print-simplify",
+                      "Polly - Print Simplify actions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SimplifyWrapperPass)
+INITIALIZE_PASS_END(SimplifyPrinterLegacyPass, "polly-print-simplify",
+                    "Polly - Print Simplify actions", false, false)
diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll
index 7de1b843a5b0a..7229a6e3d5240 100644
--- a/polly/test/CodeGen/20100617.ll
+++ b/polly/test/CodeGen/20100617.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll
index 13a6159d3e7a7..bed737741abba 100644
--- a/polly/test/CodeGen/20100622.ll
+++ b/polly/test/CodeGen/20100622.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll
index 6a4763dcb3b76..ee0422e07c4ea 100644
--- a/polly/test/CodeGen/20100707.ll
+++ b/polly/test/CodeGen/20100707.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @clause_SetSplitField(i32 %Length) nounwind inlinehint {
diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll
index 648a06479ae27..a4cd76af9dd3c 100644
--- a/polly/test/CodeGen/20100707_2.ll
+++ b/polly/test/CodeGen/20100707_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 @win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll
index 52153d7cfa730..9080451aeae50 100644
--- a/polly/test/CodeGen/20100708.ll
+++ b/polly/test/CodeGen/20100708.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define fastcc void @execute() nounwind {
diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll
index 075a4947c8e72..51dc9d311f070 100644
--- a/polly/test/CodeGen/20100708_2.ll
+++ b/polly/test/CodeGen/20100708_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll
index 0b0ed7327c8b1..a836795c9907f 100644
--- a/polly/test/CodeGen/20100713.ll
+++ b/polly/test/CodeGen/20100713.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @fft_float(i32 %NumSamples) nounwind {
diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll
index 5681f34152342..28b984bd5900f 100644
--- a/polly/test/CodeGen/20100713_2.ll
+++ b/polly/test/CodeGen/20100713_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define hidden void @luaD_callhook() nounwind {
diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll
index 97ed151410dfb..51c453cfe438e 100644
--- a/polly/test/CodeGen/20100717.ll
+++ b/polly/test/CodeGen/20100717.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @matrixTranspose(ptr %A) nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll
index cbee80e44949c..fdac75f1b999f 100644
--- a/polly/test/CodeGen/20100718-DomInfo-2.ll
+++ b/polly/test/CodeGen/20100718-DomInfo-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @getNonAffNeighbour() nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll
index e6fcaf6a9272f..da68eb0dd8fa7 100644
--- a/polly/test/CodeGen/20100718-DomInfo.ll
+++ b/polly/test/CodeGen/20100718-DomInfo.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @intrapred_luma_16x16(i32 %predmode) nounwind {
diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll
index 66c9e2bb0eb5b..3dece4efdcd06 100644
--- a/polly/test/CodeGen/20100720-MultipleConditions.ll
+++ b/polly/test/CodeGen/20100720-MultipleConditions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s
 
 ;int bar1();
 ;int bar2();
diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll
index cc3a5087090b4..f45b6544464de 100644
--- a/polly/test/CodeGen/20100809-IndependentBlock.ll
+++ b/polly/test/CodeGen/20100809-IndependentBlock.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @cfft2(ptr %x) nounwind {
 entry:
diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
index 240c2a49bc46d..82da9d2486423 100644
--- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
+++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll
index c199f757ebac5..fecdb9d4fed1e 100644
--- a/polly/test/CodeGen/20101030-Overflow.ll
+++ b/polly/test/CodeGen/20101030-Overflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @compdecomp() nounwind {
diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll
index e8b425f009723..f1503e25fcc4c 100644
--- a/polly/test/CodeGen/20101103-Overflow3.ll
+++ b/polly/test/CodeGen/20101103-Overflow3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @Reflection_coefficients(ptr %r) nounwind {
 bb20:
diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll
index 0295ee0567208..3d0c929446f45 100644
--- a/polly/test/CodeGen/20101103-signmissmatch.ll
+++ b/polly/test/CodeGen/20101103-signmissmatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @CleanNet() nounwind {
diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
index 6913deed23054..0e62e678f0ae2 100644
--- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
+++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @main() nounwind {
diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
index a39fced9dbaba..32b018f24e547 100644
--- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll
+++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll
index a7f709b4a7615..b87a3dc60deaa 100644
--- a/polly/test/CodeGen/20120316-InvalidCast.ll
+++ b/polly/test/CodeGen/20120316-InvalidCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 
 ; CHECK: polly.start
 
diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
index 554384c0e777e..dac78bf04a250 100644
--- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
+++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 
 ; We just check that this compilation does not crash.
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll
index 101930e175634..5728a768a3b3b 100644
--- a/polly/test/CodeGen/20130221.ll
+++ b/polly/test/CodeGen/20130221.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define void @list_sequence(ptr %A) {
diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
index 7ad8cbf963f45..cafd68e508255 100644
--- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
+++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
index ba4ea1565e481..47fd4f07e4678 100644
--- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll
+++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Check that we generate code without crashing.
 ;
diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
index a92917f30b724..eb7de01ba862c 100644
--- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
+++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
@@ -1,6 +1,6 @@
 ; This test checks that we do not accidentally mutate the debug info when
 ; inserting loop parallel metadata.
-; RUN: opt %loadNPMPolly -S -polly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel < %s | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s
 
 ; CHECK-NOT: !7 = !{!7}
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
index 0d947004aea50..9bb086fa79aed 100644
--- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
 ;
 ; Check that we mark multiple parallel loops correctly including the memory instructions.
 ;
diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
index 1293cd91da78d..442600cff7a0a 100644
--- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; This is a trivially parallel loop. We just use it to ensure that we actually
diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll
index be1c64938422c..82fff27dd0eb7 100644
--- a/polly/test/CodeGen/MemAccess/bad_alignment.ll
+++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
 ;
 ; Check that we do not allow to access elements not accessed before because the
 ; alignment information would become invalid.
diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
index 283c8fbd2c249..3360e10529f8e 100644
--- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
index ce44f2daceaa9..0563ca87eef51 100644
--- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll
index ab1dca516a9cf..ee0187fe97d25 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
index 72f9c2ce61e3c..6970565bf023e 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 ;
 ;float A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
index a6d9969286fc7..f0896e2bf6093 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
 
 ;int A[1040];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
index 568b0ff4ae20a..99fc36996f083 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
 ;
 ;float A[1040];
 ;
diff --git a/polly/test/CodeGen/MemAccess/create_arrays.ll b/polly/test/CodeGen/MemAccess/create_arrays.ll
index 8443e0f7be327..40ae8d6efa95f 100644
--- a/polly/test/CodeGen/MemAccess/create_arrays.ll
+++ b/polly/test/CodeGen/MemAccess/create_arrays.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
 ;
 ; for (i = 0; i < _PB_NI; i++)
 ;   for (j = 0; j < _PB_NJ; j++)
diff --git a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
index 9c95378a76433..1202d21998c94 100644
--- a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
+++ b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; #define Ni 1056
 ; #define Nj 1056
diff --git a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
index f08fabd67ef5c..7d8083cc55846 100644
--- a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
+++ b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -basic-aa -polly-print-import-jscop -disable-output < %s | FileCheck %s
 ;
 ; Check that we allow the new access functions even though they access
 ; different locations than the original ones (but the alignment is the
diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll
index ae6168d235a96..407e72702aa86 100644
--- a/polly/test/CodeGen/MemAccess/different_types.ll
+++ b/polly/test/CodeGen/MemAccess/different_types.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: \
+; RUN:     -S < %s | FileCheck %s
 ;
 ; void foo(float A[], float B[]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll
index 099a3e0670960..7b2286bfc95a9 100644
--- a/polly/test/CodeGen/MemAccess/generate-all.ll
+++ b/polly/test/CodeGen/MemAccess/generate-all.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=false -S < %s | FileCheck %s -check-prefix=SCEV
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=true -S < %s | FileCheck %s -check-prefix=ASTEXPR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \
+; RUN:     -S < %s | FileCheck %s -check-prefix=SCEV
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \
+; RUN:     -S < %s | FileCheck %s -check-prefix=ASTEXPR
 ;
 ; void foo(float A[]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
index d8d0df7009685..5c926ac638413 100644
--- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
+++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-invariant-load-hoisting -S 2>&1 < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-invariant-load-hoisting -S \
+; RUN:     2>&1 < %s | FileCheck %s
 
 ; Setting new access functions where the base pointer of the array that is newly
 ; accessed is only loaded within the scop itself caused incorrect code to be
diff --git a/polly/test/CodeGen/MemAccess/map_scalar_access.ll b/polly/test/CodeGen/MemAccess/map_scalar_access.ll
index 4ea21b26ce531..7c845d4a004f4 100644
--- a/polly/test/CodeGen/MemAccess/map_scalar_access.ll
+++ b/polly/test/CodeGen/MemAccess/map_scalar_access.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-import-jscop -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 
 define void @map_scalar_access(ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll
index edc3888be364b..7848977ce0310 100644
--- a/polly/test/CodeGen/MemAccess/multiple_types.ll
+++ b/polly/test/CodeGen/MemAccess/multiple_types.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-allow-differing-element-types -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-allow-differing-element-types \
+; RUN:     -S < %s | FileCheck %s
 ;
 ; // Check that accessing one array with different types works.
 ; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll
index 63d66f1c925f7..5077e1a1b5a2c 100644
--- a/polly/test/CodeGen/MemAccess/simple.ll
+++ b/polly/test/CodeGen/MemAccess/simple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ;int A[100];
diff --git a/polly/test/CodeGen/MemAccess/simple_analyze.ll b/polly/test/CodeGen/MemAccess/simple_analyze.ll
index f07cb1629ca18..143651b565aff 100644
--- a/polly/test/CodeGen/MemAccess/simple_analyze.ll
+++ b/polly/test/CodeGen/MemAccess/simple_analyze.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+;RUN: opt %loadPolly -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
 
 @A = common global [100 x i32] zeroinitializer, align 4
diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll
index 93f5f186ad6a5..51fa97adb3c37 100644
--- a/polly/test/CodeGen/MemAccess/update_access_functions.ll
+++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-import-jscop-postfix=transformed \
+; RUN:     < %s -S | FileCheck %s
 
 ; CHECK-LABEL: polly.stmt.loop1:
 ; CHECK-NEXT:    %3 = mul nsw i64 5, %polly.indvar{{[0-9]*}}
diff --git a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
index 344a6d0990837..ebe91636ea3cc 100644
--- a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
+++ b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-annotate-metadata-vectorize < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-annotate-metadata-vectorize < %s | FileCheck %s
 
 ; Basic verification of vectorize metadata getting added when "-polly-vectorize-metadata" is
 ; passed.
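The MemAccess hunks compose the JSCoP importer and code generation into a single pipeline, '-passes=polly-import-jscop,polly-codegen'. A condensed sketch of that pattern (hypothetical RUN lines, not part of this patch; the importer reads a JSON description of the transformed SCoP from a file next to the test, and -polly-import-jscop-postfix selects which variant — the exact file-naming scheme below is an assumption, not something these hunks show):

; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
; RUN:     -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
; Assumed: the importer locates a sibling file along the lines of
;   <function>___<region>.jscop.transformed
; describing the new access functions; polly-codegen then emits code for them.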
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll
index 541fbdda5a6b9..121f630789892 100644
--- a/polly/test/CodeGen/OpenMP/alias-metadata.ll
+++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
 ;
 ; void foo(float *A, float *B) {
 ;   for (long i = 0; i < 1000; i++)
diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
index 657921690c74d..7177ae01f0754 100644
--- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
+++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check that we do not crash but generate parallel code
 ;
diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll
index ac6c7070c1abf..82a73780886e3 100644
--- a/polly/test/CodeGen/OpenMP/inlineasm.ll
+++ b/polly/test/CodeGen/OpenMP/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s
 ; llvm.org/PR51960
 
 ; CHECK-LABEL: define internal void @foo_polly_subfn
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
index 08c0cc7fe37f2..aba3ae78f7783 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
index 8246aaa25b7b2..8cf6148a7b44c 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
index 0c5208c77768b..823e5cab55ab3 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but
 ; not B[0] as it is not needed
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
index fd039e75444b5..5557839e715ed 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
index fe8b8a3a022bc..a987fac31b743 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; This code has failed the scev based code generation as the scev in the scop
 ; contains an AddRecExpr of an outer loop. When generating code, we did not
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
index d1f48d92e0e75..96c6d900a7a00 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; AST: #pragma simd
 ; AST: #pragma omp parallel for
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
index 5b032801c7282..c4ad665c7b6cf 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; The interesting part of this test case is the instruction:
 ;   %tmp = bitcast i8* %call to i64**
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
index d612faf7b67c5..82acba8b3c523 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
 
 ; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
index 213cc2635fb6d..aa44658131bba 100644
--- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
+++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
 ;
 ; float A[100];
 ;
diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
index fef23f141eaeb..4deab1af0ccf0 100644
--- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
+++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly<no-default-opts;delicm>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s
 ;
 ; Verify that -polly-parallel can handle mapped scalar MemoryAccesses.
 ;
diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
index fd8ce87b45ae8..43326b29f7ef1 100644
--- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll
+++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck --check-prefix=CODEGEN %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s
 ; REQUIRES: asserts
 
 ; Parallelization of detected matrix-multiplication.
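A recurring pattern in the OpenMP hunks: AST-level checks move to '-passes=print<polly-ast>', while IR-level checks drive -passes=polly-codegen with -polly-parallel. The parallel loop body is outlined into a subfunction named after the enclosing function, which is what the @foo_polly_subfn and @DoStringSort_polly_subfn checks above match. A sketch of that shape (hypothetical test, assuming SCoP detection and parallelization succeed on the loop):

; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force \
; RUN:     -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: define internal void @par_polly_subfn
define void @par(ptr %A) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr i32, ptr %A, i64 %i
  store i32 0, ptr %gep
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp slt i64 %i.next, 1024
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}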
diff --git a/polly/test/CodeGen/OpenMP/new_multidim_access.ll b/polly/test/CodeGen/OpenMP/new_multidim_access.ll
index 8018acdcb0e6a..5faabb4d20c1a 100644
--- a/polly/test/CodeGen/OpenMP/new_multidim_access.ll
+++ b/polly/test/CodeGen/OpenMP/new_multidim_access.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-print-import-jscop \
+; RUN:     -disable-output < %s | FileCheck %s
 
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S -polly-parallel < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadPolly -polly-import-jscop \
+; RUN:     -polly-codegen -S < %s \
+; RUN:     -polly-parallel \
+; RUN:     | FileCheck %s -check-prefix=IR
 
 ; void new_multidim_access(long n, long m, float A[][m]) {
 ;   for (long i = 0; i < n; i++)
diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
index 99069612cd1d4..b7b3a44610f32 100644
--- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll
+++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we pass %rem96 to the parallel subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
index 236362a3e23dc..c207f589e4da0 100644
--- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
+++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
@@ -1,8 +1,17 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-scheduling=runtime -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=IR
+
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=IR
+
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=LIBOMP-IR
 
 ; IR: @GOMP_parallel_loop_runtime_start
diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
index 9925187883173..dbfbd9a905086 100644
--- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll
+++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; IR: @foo_polly_subfn
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
index 3738266b558ed..ee43b8aa34a44 100644
--- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
+++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 
 ; - Test the case where scalar evolution references a loop that is outside
diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll
index fb420b06b9afb..7a8cd77bb1571 100644
--- a/polly/test/CodeGen/OpenMP/reference_latest.ll
+++ b/polly/test/CodeGen/OpenMP/reference_latest.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;delicm;simplify>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s
 ;
 ; Test that parallel codegen handles scalars mapped to other arrays.
 ; After mapping "store double %add10" references the array "MemRef2".
diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
index 861a78e4acd7a..9b79f29094482 100644
--- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll
+++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s
 ; CHECK: define internal void @DoStringSort_polly_subfn
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll
index 5e8a58fadd56c..e5aee840ade74 100644
--- a/polly/test/CodeGen/OpenMP/single_loop.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
 
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
 
 ; This extensive test case tests the creation of the full set of OpenMP calls
 ; as well as the subfunction creation using a trivial loop as example.
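single_loop.ll above enumerates the scheduling knobs these RUN lines exercise: the default GNU OpenMP backend versus -polly-omp-backend=LLVM, combined with -polly-scheduling={static,dynamic,runtime} and -polly-scheduling-chunksize. With runtime scheduling on the GNU backend, the tests in this patch check for @GOMP_parallel_loop_runtime_start; a condensed sketch of that pairing (hypothetical, reusing the trivial loop from the earlier sketch):

; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force \
; RUN:     -passes=polly-codegen -polly-scheduling=runtime -S < %s | FileCheck %s
; CHECK: @GOMP_parallel_loop_runtime_start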
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll index 95324793f4fa4..c519bfdee7a58 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; #define N 1024 ; float A[N]; diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll index 7334762f84f6c..f6dfd62d6bcc1 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll @@ -1,8 +1,18 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=LIBOMP-IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-STATIC-IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ +; RUN: -polly-scheduling=static \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR ; Ensure the scalars are initialized before the OpenMP code is launched. 
; diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll index 77c1b23a3f76c..934e04461f134 100644 --- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll +++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This test case verifies that we create correct code even if two OpenMP loops ; share common outer variables. diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll index 39bdac793e8a1..3e0c9d67d5ca8 100644 --- a/polly/test/CodeGen/PHIInExit.ll +++ b/polly/test/CodeGen/PHIInExit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" %struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t } diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll index 9ec9804d35b0d..ccb0d15cfc3d2 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-codegen-add-debug-printing -polly-ignore-aliasing < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-codegen-add-debug-printing \ +; RUN: -polly-ignore-aliasing < %s | FileCheck %s ; #define N 10 ; void foo(float A[restrict], double B[restrict], char C[restrict], diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll index 736c136eeb67c..4ffb7fd6e4621 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s ; define void @func(i32 %n, ptr %A) { diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll index bab2690bddb17..0440bda74b391 100644 --- a/polly/test/CodeGen/alias-check-multi-dim.ll +++ b/polly/test/CodeGen/alias-check-multi-dim.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen \ +; RUN: -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: sext i32 %indvar.init to i64 diff --git 
a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll index 37ec2d5b748af..4186b8521a535 100644 --- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll +++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \ +; RUN: | FileCheck %s ; ; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[], ; float A6[], float A7[], float A8[], float A9[]) { diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll index 7fed270cb51dd..8e1fc3b328355 100644 --- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll +++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; We have to cast %B to "short *" before we create RTCs. ; diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll index 5326af339ddac..e601c22b978da 100644 --- a/polly/test/CodeGen/aliasing_different_pointer_types.ll +++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; Check that we cast the different pointer types correctly before we compare ; them in the RTC's. We use i8* as max pointer type. diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll index 5d0b40d6b59aa..e1dae03280a0e 100644 --- a/polly/test/CodeGen/aliasing_multidimensional_access.ll +++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; Check that we calculate the maximal access into array A correctly and track the overflow state. 
 ;
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
index 1b7b85835d795..a79ba2532535d 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; void jd(int *A, int *B, int c) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
index fa8053ccabbea..efe4af1c9e7c5 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; void jd(int *A, int *B, int c) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll
index 4e8570944f6c6..3079e58d7daba 100644
--- a/polly/test/CodeGen/aliasing_struct_element.ll
+++ b/polly/test/CodeGen/aliasing_struct_element.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; We should only access (or compute the address of) "the first element" of %S
 ; as it is a single struct not a struct array. The maximal access to S, thus
diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll
index daf7999c8072b..e0f6a959476f6 100644
--- a/polly/test/CodeGen/alignment.ll
+++ b/polly/test/CodeGen/alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check that the special alignment information is kept
 ;
diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll
index 7d2d9038270a9..ada03e0663722 100644
--- a/polly/test/CodeGen/annotated_alias_scopes.ll
+++ b/polly/test/CodeGen/annotated_alias_scopes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=SCOPES
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
 ;
 ; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way.
 ;
diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll
index 461af09b5b289..99f2eae9dd8e5 100644
--- a/polly/test/CodeGen/blas_sscal_simplified.ll
+++ b/polly/test/CodeGen/blas_sscal_simplified.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Regression test for a bug in the runtime check generation.
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
index 5eb6076892f3e..5dba93373b70b 100644
--- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
+++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s
 ;
 ; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a
 ; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a
diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll
index 9d3c5a811b16a..905aa52df5080 100644
--- a/polly/test/CodeGen/constant_condition.ll
+++ b/polly/test/CodeGen/constant_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s
 ;#include <string.h>
 ;int A[1];
diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll
index d4df48b757d3d..b8c9a81b71a91 100644
--- a/polly/test/CodeGen/create-conditional-scop.ll
+++ b/polly/test/CodeGen/create-conditional-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -verify-loop-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
index 31b5e69ae4c6a..dfef4202391d4 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Check we do not crash even though the dead %tmp8 is referenced by a parameter
 ; and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
index 88b844bea5e4e..fcc6764ce9c21 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Check we do not crash even though there is a dead load that is referenced by
 ; a parameter and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index f397a4b83d88a..ed4b81a8e3a3c 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
+; RUN: FileCheck %s
 
-; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
+; RUN: FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index 7f6f128c2cff2..edc03333a358d 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 ;
 ; This caused dominance problems at some point as we do bail out during
 ; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index f6c39eb0517bc..a2fe805f402e0 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;opt-isl;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR35362
 ; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index 09570938a9ca1..f2c9da04d6495 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 7d8ef7acf9435..2f1ec1a7872aa 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node that is used in a different of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index c5c11c8ea2f8f..63b6becd19574 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index 1e38210c733d9..abec28894f45b 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index b123e90c07882..c9b419abe3242 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=pow2 -S < %s | FileCheck %s -check-prefix=POW2
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=pow2 \
+; RUN: -S < %s | FileCheck %s -check-prefix=POW2
 ;
 ; void exprModDiv(float *A, float *B, float *C, long N, long p) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index c7873baeeaeb7..1ca2413fd5e19 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; Check that we generate valid code even if the load of cont_STACKPOINTER is
 ; hoisted in one SCoP and used (through the phi node %tmp2).
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index 31ae969cd3156..aa29bfd7dbcbc 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index eb6f7ae5ff6d1..1b913f2cb07be 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index f7595a6afb0be..d210105c46baf 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
 ;
 ; Excerpt from the test-suite's oggenc reduced using bugpoint.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 247c102834b25..33233fe2fdf17 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; The SCEV expression in this test case refers to a sequence of sdiv
 ; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index fc1cce41c0f4e..a8c626347efe9 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; This test case has a inner SCEV sdiv that will escape the SCoP. Just check we
 ; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 1ff598a4a021a..31c14e85f253e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
 ; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index 4cd146ddbf62e..b42371b0891e6 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: < %s | FileCheck %s
 ;
 ; Check that this will not crash our code generation.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index 586875bbefcbe..45af63402c986 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 ;
 ; This will just check that we generate valid code here.
 ;
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index 0f35664eb7e1c..a708548798ebb 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that we remove the lifetime markers from everywhere.
 ;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index 4a64c1a641182..a643b8accd4e9 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that we remove the misc intrinsics from the optimized SCoP.
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index 15fe0d9e22416..e7cbf748bea73 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index c1ab026e97701..24e9240c234d1 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index f0c833ce1bce1..d1d861e316ee4 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 13576b9f40455..21e53055c56b0 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-process-unprofitable -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index d92d97012b33c..1fd9cb81771c6 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check that we generate valid code as we did non preload the base pointer
 ; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index f6dcac08dffca..420cb608f9ba4 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; The offset of the %tmp1 load wrt. to %buff (62 bytes) is not divisible
 ; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index c89da73efc839..2d5e6042ea6a4 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 7d5139cc55f88..3d1958e5b8a43 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 2a704ee9c576a..252463384a5c8 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
+; RUN: -S < %s | FileCheck %s
 ;
 ; This test case checks whether Polly generates alias metadata in case of
 ; the ublas gemm kernel and polly-invariant-load-hoisting.
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index f6b873994036c..d4ac433475f05 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 4dbcc3b3b049d..06a9a93363ed9 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 39520c8fd8217..66ab9a31b1032 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index 414ca127a251f..fa904e9b96d34 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: | FileCheck %s
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 0
 ; CHECK: %polly.access.A.load = load ptr, ptr %polly.access.A
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index f0782c023378b..36e588329d669 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 034c3587a0708..0a88bb70966d2 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
+; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index 85578d3ba0992..416148b72303b 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; int f(int *A, int *B) {
 ; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index ff6e9a8e3ddae..906bfc1805d39 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; void fence(void);
 ;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index edd38cab2afba..ab02e639f0d2a 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This crashed at some point as the invariant load is in a non-affine
 ; subregion. Just check it does not anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index 923102440c547..1db27ad8e58ba 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK: polly.start
 ;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 0e381b863fb8b..5a11adcdebbc5 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Check that this does not crash as the invariant load is not executed (thus
 ; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index bbbe1f1663964..7e0550fb3be94 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ; CHECK: polly.start
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index 9fe343f752d14..abf957b556daa 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; SCOP: Assumed Context:
 ; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index dc1c2bca4b6e3..b565f1bd5096a 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index bb60c50b1ab40..ba2999e27984d 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 87c407e05b972..26c964c9c6a72 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
 ; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 5e2b28c53019e..6bf11d5697bd7 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 20d9f6d40b7d6..07ce941522459 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index 51f8a55d1a400..19b30afd33ba7 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting -polly-ignore-parameter-bounds -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
+; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
 ; CHECK: polly.preload.begin:
 ; CHECK-NEXT: %global.load = load i32, ptr @global, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 432c155fdd3ae..1dcc175ebb163 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This crashed at some point as the pointer returned by the call
 ; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 65ba2cd993193..43b3d99e11a2f 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; Check we generate valid code.
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index 5e225d73bdcd3..34c4e398e2ac0 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
 ; https://github.com/llvm/llvm-project/issues/56692
 ;
 ; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index 4d55273618df6..b228baf9bdf22 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index 33b3d3b72225f..ccd4198b9fe85 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; A load must provide a value for every statement instance.
 ; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index dc0c5517d7ca5..f43247b3e5057 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index cf28a4de63f3b..49e312404cca8 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index 1d8a8132a79cb..8ae38eeeb4982 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ; Verify that we actually detect this loop as the innermost loop even though
 ; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index c222f67ed7836..64019a6090212 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 32256a7344664..5dcb51dcb91cd 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
 ;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 5d7f67f1f9060..26fe4eb82ae49 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Test case to trigger the hard way of creating a unique entering
 ; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index c3ffe4abcddd6..501aa8fbea4d6 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that @llvm.memcpy does not get a !alias.scope annotation.
 ; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index b084672971855..f63eb18118e77 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 ; CHECK: polly
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index 66a4fdf42bc8e..63e43c83ada5f 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index d3f8b718889e4..86b17573caada 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 ; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index e85d7c9e7785d..93e34e2fd0fc1 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index cb12700bfb561..a63f8a615ff9e 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>,polly<no-default-opts>' -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=function(polly<no-default-opts;opt-isl>),function(polly<no-default-opts>)' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
 ;
 ; llvm.org/PR34441
 ; Properly handle multiple -polly-scops/-polly-codegen in the same
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index b92359782d999..effae223c152a 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; This test case has two scops in a row. When code generating the first scop,
 ; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 96615079be365..101fcaff0c82e 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
 ; CHECK: polly
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index ca89cb53e09b7..930041eaddaad 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
 ; CHECK: %polly.access.global.load.load = load i32, ptr %polly.access.global.load
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 8198108b22059..1e06a7e186bb0 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
 ; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
 ; phi nodes in polly.merge_new_and_old, one illegaly using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index f915b5a0772e6..d5ad9a7aef239 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index 604c5ac54bcdb..a022083f43a9e 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
 ;
 ; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
 ;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index ebb02a90ffb5d..6015516a3bc49 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index ff9f504295672..0d0f634ed7c16 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index 2ad1e75216362..bfa3c156ea75d 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index 386fe5f9f207f..b9386333a79b4 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 5e5f34d99bde3..6460c427270f4 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index db9f0d518041b..1b6802f1a4c35 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.wombat = type {[4 x i32]}
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 096eb8609e1bb..007a4c586aa32 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same operand
 ; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 2810a8ab5361f..20edbf2bd6c03 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same BBMap was
 ; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index cdb2000d90d6b..0ff39d3fe882d 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25438
 ; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index b4889c76079cc..7df3d8976ea80 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
+; RUN: -S < %s | FileCheck %s
 ; This test verifies that values defined in another scop statement and used by
 ; PHI-nodes in non-affine regions are code generated correctly.
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 45465c627f55a..179062dd62d0a 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
+; RUN: < %s | FileCheck %s
 ;
 ; Check that we do not reuse the B[i-1] GEP created in block S again in
 ; block Q. Hence, we create two GEPs for B[i-1]:
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 90d5efdc3a9f5..427e7e2461f1d 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index 5bb4fd19f4fd1..292c0f2b53941 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25412
 ; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index 582607787eb7d..03f091a405017 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
 ;
 ; void non-affine-update(double A[], double C[], double B[]) {
 ; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index eaf74d9c63e0e..153cdb7ed9f6c 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index 9709e231a4e86..a359b662e6579 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-branches -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
+; RUN: < %s | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index be7a8a23df869..445cef0d6f697 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK-NOT: polly.start
 ;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index 730c57299d569..4c33be3407251 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: 
opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR ; Ensure that the provided thread numbers are forwarded to the OpenMP calls. ; diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll index 8d5f74751af49..dd0a24b14a3b8 100644 --- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll +++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll index 3ae95020d52dd..8eba6444abb16 100644 --- a/polly/test/CodeGen/param_div_div_div_2.ll +++ b/polly/test/CodeGen/param_div_div_div_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR ; ; Check that we guard the divisions because we moved them and thereby increased ; their domain. 
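A note on the shape of these changes: the RUN-line rewrites in this patch are mechanical, and the recurring mappings (spellings taken verbatim from the hunks in this patch) include:

  '-passes=polly<no-default-opts>'                      ->  -passes=polly-codegen
  '-passes=polly-custom<scops>' -polly-print-scops      ->  '-passes=print<polly-function-scops>'
  '-passes=polly-custom<ast>' -polly-print-ast          ->  '-passes=print<polly-ast>'
  '-passes=polly-custom<detect>' -polly-print-detect    ->  '-passes=print<polly-detect>'
  '-passes=polly-custom<import-jscop;codegen>'          ->  '-passes=polly-import-jscop,polly-codegen'

A representative before/after pair, as seen in the hunks above:

  old: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
  new: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s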
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll index fe5fd8cffece7..fad4b21cf3dc8 100644 --- a/polly/test/CodeGen/partial_write_array.ll +++ b/polly/test/CodeGen/partial_write_array.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of an array access. ; diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll index d0e5615e4220d..67828808e2fac 100644 --- a/polly/test/CodeGen/partial_write_emptyset.ll +++ b/polly/test/CodeGen/partial_write_emptyset.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write, where "partial" is the empty set. ; The store is never executed in this case and we do generate it in the diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll index a36414297485a..b26bd81b5663b 100644 --- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll +++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75 ; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3 diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll index e0069ebc8eae8..7577b137a2750 100644 --- a/polly/test/CodeGen/partial_write_impossible_restriction.ll +++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; The isl scheduler isolates %cond.false into two instances. 
; A partial write access in one of the instances was never executed, diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll index e7f4225cf9310..7c138c82091e5 100644 --- a/polly/test/CodeGen/partial_write_in_region.ll +++ b/polly/test/CodeGen/partial_write_in_region.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ +; RUN: -polly-import-jscop-postfix=transformed \ +; RUN: -verify-dom-info \ +; RUN: -S < %s | FileCheck %s ; ; void foo(long A[], float B[], float C[]) { ; for (long i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll index 85b56fefad809..ba15a7871f431 100644 --- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll +++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -polly-allow-nonaffine-loops -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ +; RUN: -polly-import-jscop-postfix=transformed \ +; RUN: -verify-dom-info -polly-allow-nonaffine-loops \ +; RUN: -S < %s | FileCheck %s ; This test verifies that partial writes within non-affine loops are code ; generated correctly. diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll index bb99d4ea086d2..b8c413885cdb0 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar. ; diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll index 37a9d98c6a22e..8c1953a05ad3c 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar in a non-affine subregion. 
; diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll index 61f122228c377..4b91e5055c0b1 100644 --- a/polly/test/CodeGen/perf_monitoring.ll +++ b/polly/test/CodeGen/perf_monitoring.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll index 4c47a12c12904..d5c33d64f3418 100644 --- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll index 6d09d8bf27ebe..ab99c4d2de062 100644 --- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll index 2ccd7965bbeaf..447a14e9999c2 100644 --- a/polly/test/CodeGen/phi-defined-before-scop.ll +++ b/polly/test/CodeGen/phi-defined-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK-LABEL: polly.merge_new_and_old: ; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ] diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll index 1655104b08390..e096aa2f4f8c0 100644 --- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll +++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Make sure code generation does not break in case an 'error block' is detected ; outside of the scope. In this situation, we should not affect code generation. 
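The partial_write_* tests above all pair '-passes=polly-import-jscop,polly-codegen' with -polly-import-jscop-postfix=transformed. As a sketch of how the pieces fit together (the file-naming scheme is recalled from Polly's JSON importer and should be treated as an assumption, not something this patch states): polly-import-jscop loads a JSON description of the transformed schedule and accesses, named after the function and SCoP with the postfix appended, and polly-codegen then emits code from the imported SCoP rather than the detected one:

  opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
      -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s

  (expects a sibling file along the lines of <function>___<region>.jscop.transformed)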
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll index 1cadac0a5cf73..9d73d8a792558 100644 --- a/polly/test/CodeGen/phi_condition_modeling_1.ll +++ b/polly/test/CodeGen/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll index 8f2e2a517c96c..2d1364842d735 100644 --- a/polly/test/CodeGen/phi_condition_modeling_2.ll +++ b/polly/test/CodeGen/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll index 5f0f8de19f223..25bcf2a118ef4 100644 --- a/polly/test/CodeGen/phi_conditional_simple_1.ll +++ b/polly/test/CodeGen/phi_conditional_simple_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll index 703e55f15c084..43d29b9ec8649 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through. ; diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll index 3d911e0d6a87f..9f28024fcfa0a 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll index 5f81f52078723..73e99ac0f32c5 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. 
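phi_conditional_simple_1.ll above shows the common two-RUN pattern after the rewrite: one invocation prints the ISL AST and discards the IR, feeding the AST-prefixed checks, while a second runs codegen and checks the emitted IR:

  opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
  opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s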
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll index abb86e650ce2a..6c9bd56a98722 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll index 47a8a8190c8d9..4cb392d3353d3 100644 --- a/polly/test/CodeGen/phi_loop_carried_float.ll +++ b/polly/test/CodeGen/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll index 81dd5cecd1878..9fd8ad413128a 100644 --- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll +++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S \ +; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s -; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S \ +; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll index 6331c24da31b0..80a1c41b83ac0 100644 --- a/polly/test/CodeGen/phi_scalar_simple_1.ll +++ b/polly/test/CodeGen/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N) { ; for (int i = 1; i < N; i++) diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll index 0adadf6b90159..614c8acfb9f8e 100644 --- a/polly/test/CodeGen/phi_scalar_simple_2.ll +++ b/polly/test/CodeGen/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll index 4d6ede638c8f2..7e21666f1db00 100644 --- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll +++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; CHECK: polly.merge_new_and_old: ; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ] diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll 
index 4de24fb058c26..36a8684dbc37a 100644 --- a/polly/test/CodeGen/phi_with_one_exit_edge.ll +++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; ; CHECK: polly.merge_new_and_old: diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll index 706b01d7f8ca5..918e4c6c9c0b0 100644 --- a/polly/test/CodeGen/pointer-type-expressions-2.ll +++ b/polly/test/CodeGen/pointer-type-expressions-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(ptr %start, ptr %end) { diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll index 2478e2238fd0e..e7feebc163d4b 100644 --- a/polly/test/CodeGen/pointer-type-expressions.ll +++ b/polly/test/CodeGen/pointer-type-expressions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll index cac6f4fdd16f1..9ee050a1e5070 100644 --- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll +++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; void f(int a[], int N, float *P, float *Q) { diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll index ca5d866ae6cce..b8202318a3eca 100644 --- a/polly/test/CodeGen/pointer_rem.ll +++ b/polly/test/CodeGen/pointer_rem.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<ast>' -polly-print-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<codegen>' -polly-print-scops -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128" target triple 
= "aarch64--linux-gnu" diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll index 94be6d7824921..7547b0bbed749 100644 --- a/polly/test/CodeGen/pr25241.ll +++ b/polly/test/CodeGen/pr25241.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241) ; Ensure that synthesized values of a PHI node argument are generated in the diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll index 49a8c38309eb2..a551d810c0802 100644 --- a/polly/test/CodeGen/ptrtoint_as_parameter.ll +++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; CHECK: if.then260: ; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1 diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll index 2ae0f9e797bd1..365cbbce495fb 100644 --- a/polly/test/CodeGen/read-only-scalars.ll +++ b/polly/test/CodeGen/read-only-scalars.ll @@ -1,5 +1,9 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \ +; RUN: \ +; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \ +; RUN: \ +; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR ; CHECK-NOT: alloca diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll index 21d8c0f98b702..8c5f70770a1c5 100644 --- a/polly/test/CodeGen/reduction.ll +++ b/polly/test/CodeGen/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index f9576826b4f77..060a1866870e4 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll index 53cbdf407c954..0fe1085dbbacd 100644 --- a/polly/test/CodeGen/reduction_simple_binary.ll +++ b/polly/test/CodeGen/reduction_simple_binary.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: pragma simd reduction ; diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll index 9d5ba4c4ff9fb..58c27091a22c3 100644 --- 
a/polly/test/CodeGen/reggen_domtree_crash.ll +++ b/polly/test/CodeGen/reggen_domtree_crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s ; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll index f061ac061e226..e5f7d0f9ef5d6 100644 --- a/polly/test/CodeGen/region-with-instructions.ll +++ b/polly/test/CodeGen/region-with-instructions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.bb48: ; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3 diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll index 16b265c064790..06e0d9df3d951 100644 --- a/polly/test/CodeGen/region_exiting-domtree.ll +++ b/polly/test/CodeGen/region_exiting-domtree.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s ; Verify that the DominatorTree is preserved correctly for the inserted ; %polly.stmt.exit.exit block, which serves as new exit block for the generated diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll index 9d21d16c9f9cd..39e04dbf93ac7 100644 --- a/polly/test/CodeGen/region_multiexit_partialwrite.ll +++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; This text case has a partial write of PHI in a region-statement. It ; requires that the new PHINode from the region's exiting block is diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll index 7984b7ce80209..4afaab5bbad0a 100644 --- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll +++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; TODO: FIXME: Simplify the context. 
; AST: if (n >= 1 && 0 == n <= -1) diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll index 44d2a4f15b378..914b76f5e0be7 100644 --- a/polly/test/CodeGen/run-time-condition.ll +++ b/polly/test/CodeGen/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll index 102ef04128133..77306c1046133 100644 --- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll +++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Test the code generation in the presence of a scalar out-of-scop value being ; used from within the SCoP. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 1988f77086c8a..0c1164b245a43 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly \ +; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node ; alloca, is directly computed and not incorrectly transferred through memory. diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll index 0179072391a33..375f097283b07 100644 --- a/polly/test/CodeGen/scalar_codegen_crash.ll +++ b/polly/test/CodeGen/scalar_codegen_crash.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly \ +; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test cases used to crash the scalar code generation. Check that we ; can generate code for it. diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll index 09fcfe3e4a09c..e0941690ae489 100644 --- a/polly/test/CodeGen/scev-backedgetaken.ll +++ b/polly/test/CodeGen/scev-backedgetaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; llvm.org/PR48422 ; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated. diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll index 5942ecbe7cee9..70f090eae07b3 100644 --- a/polly/test/CodeGen/scev-division-invariant-load.ll +++ b/polly/test/CodeGen/scev-division-invariant-load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s ; ; Check that we generate valid code as we did not use the preloaded ; value of %tmp1 for the access function of the preloaded %tmp4. 
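read-only-scalars.ll above illustrates how a boolean option is covered from a single test: the same codegen pipeline runs once per setting, and the FileCheck prefix selects which expectations apply:

  opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | FileCheck %s
  opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SCALAR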
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll index a09d8c5504b1b..e2b5afda1bfff 100644 --- a/polly/test/CodeGen/scev.ll +++ b/polly/test/CodeGen/scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @f () inlinehint align 2 { diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll index 095c362024a83..0adb0ba7eea81 100644 --- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll +++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s ; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the ; test-suite. diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll index 81f4b96d22a37..142e83f820fe7 100644 --- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll +++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; Scalar write of bitcasted value. Instead of writing %b of type ; %structty, the SCEV expression looks through the bitcast such that diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll index 1cba7567a5e43..fd73132258ddc 100644 --- a/polly/test/CodeGen/scop_expander_insert_point.ll +++ b/polly/test/CodeGen/scop_expander_insert_point.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; CHECK: entry: ; CHECK-NEXT: %outvalue.141.phiops = alloca i64 diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll index 56d37a0175853..d94a1fdfb2c12 100644 --- a/polly/test/CodeGen/scop_expander_segfault.ll +++ b/polly/test/CodeGen/scop_expander_segfault.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s ; ; This test was extracted from gcc in SPEC2006 and it crashed our code ; generation, or to be more precise, the ScopExpander due to a endless diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll index cdcfe838fa915..9f968e5657c90 100644 --- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll +++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Verify that we generate the runtime check code after the conditional branch ; in the SCoP region entering block (here %entry). 
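pointer_rem.ll above uses the one other pipeline shape in this patch, '-passes=print<polly-function-scops>,scop(polly-codegen)'. Reading scop(...) as Polly's nested SCoP pass manager (an inference from the spelling; the patch itself does not explain it), this runs the SCoP printer and codegen over the same detected SCoPs in a single opt invocation:

  opt %loadNPMPolly -polly-process-unprofitable \
      '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN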
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll index 144c05b5effba..85be37755c474 100644 --- a/polly/test/CodeGen/select-base-pointer.ll +++ b/polly/test/CodeGen/select-base-pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s ; ; Check that we do not crash here. ; diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll index eeb3048007859..33a3ee9fbbd47 100644 --- a/polly/test/CodeGen/sequential_loops.ll +++ b/polly/test/CodeGen/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll index 1b3518bdb0cba..a7e36bc4c7330 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll index 3af9913e6aa04..22e9da09ef857 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll index 8800dc7214b06..c33a77ae07939 100644 --- a/polly/test/CodeGen/simple_non_single_entry.ll +++ b/polly/test/CodeGen/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll index 5b1cd1991cd73..bc62047a80a34 100644 --- a/polly/test/CodeGen/simple_nonaffine_loop.ll +++ b/polly/test/CodeGen/simple_nonaffine_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s ;#include <stdio.h> ;#include <stdlib.h> diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll index f0142f726efa4..a65e3a25f035a 100644 --- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 
| FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll index cc5e7b221026c..acccb48f18a3c 100644 --- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; XFAIL: * ;define N 20 diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll index 1299362369478..7a67f6ba96ce2 100644 --- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll index d025ef2116a40..2d939167b71ee 100644 --- a/polly/test/CodeGen/single_do_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; XFAIL: * ;#define N 20 diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll index b473e266343a3..83c9e9d0324ce 100644 --- a/polly/test/CodeGen/single_do_loop_scev_replace.ll +++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll index c04738e6843a0..2db34663e93ce 100644 --- a/polly/test/CodeGen/single_loop.ll +++ b/polly/test/CodeGen/single_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll index 82ec7ffd85462..f83e8823c63df 100644 --- a/polly/test/CodeGen/single_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll index 8affb71fad649..1427189d74a7d 100644 --- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include "limits.h" ;#define N 20 diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll index 307b8358ff980..1a70d4a879d83 100644 --- a/polly/test/CodeGen/single_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ; diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll index 1d78c7a7329d4..44ce1236e9f84 100644 --- a/polly/test/CodeGen/single_loop_param.ll +++ b/polly/test/CodeGen/single_loop_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll index 5fad1d43ae0d7..fda9bfab11b8f 100644 --- a/polly/test/CodeGen/single_loop_param_less_equal.ll +++ b/polly/test/CodeGen/single_loop_param_less_equal.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll index 75a8cb2094a16..b888c860eacd0 100644 --- a/polly/test/CodeGen/single_loop_param_less_than.ll +++ b/polly/test/CodeGen/single_loop_param_less_than.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll 
b/polly/test/CodeGen/single_loop_zero_iterations.ll
index 3194dba52190b..b1ce491b5c8a2 100644
--- a/polly/test/CodeGen/single_loop_zero_iterations.ll
+++ b/polly/test/CodeGen/single_loop_zero_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
 ;#define N 20
 ;
diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll
index 73d6006a6b621..f4b17e687ada6 100644
--- a/polly/test/CodeGen/split_edge_of_exit.ll
+++ b/polly/test/CodeGen/split_edge_of_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s
 ;
 ; This is a scop directly preceded by a region, i.e. the scop's entry is the
 ; region's exit block. This test is to ensure that the RegionInfo is correctly
diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll
index 03363f49ce800..b921202285bb2 100644
--- a/polly/test/CodeGen/split_edges.ll
+++ b/polly/test/CodeGen/split_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 @A = common global [1536 x float] zeroinitializer
diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll
index 59df1618cfd71..8f4d48f5dcb00 100644
--- a/polly/test/CodeGen/split_edges_2.ll
+++ b/polly/test/CodeGen/split_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll
index 177d86adb9066..a13a1b6ab98f2 100644
--- a/polly/test/CodeGen/srem-in-other-bb.ll
+++ b/polly/test/CodeGen/srem-in-other-bb.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: < %s | FileCheck %s
 ;
 ; void pos(float *A, long n) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
index 5a490b68b9a9f..b49c4e12fe11a 100644
--- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
+++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -verify-dom-info '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \
+; RUN: -polly-invariant-load-hoisting=true | FileCheck %s
 ;
 ; This caused an infinite recursion during invariant load hoisting at some
 ; point. Check it does not and we add a "false" runtime check.
diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll
index d41e4a87bfb65..bb878cc342af8 100644
--- a/polly/test/CodeGen/stmt_split_no_dependence.ll
+++ b/polly/test/CodeGen/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !3, !noalias !6
 ; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !6, !noalias !3
diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll
index 6696efca63f02..1a9e7081bebdc 100644
--- a/polly/test/CodeGen/switch-in-non-affine-region.ll
+++ b/polly/test/CodeGen/switch-in-non-affine-region.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ;   for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
index 86395f25db1a8..b2a062363eef4 100644
--- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
+++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check for the correct written value of a scalar phi write whose value is
 ; defined within the loop, but its effective value is its last definition when
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
index b5172badd76dc..5668063c27c8e 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -verify-loop-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s
 ;
 ; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167
 ;
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll
index 39cadc78f7e36..fdc98fbb4d9e7 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Check that we do not crash as described here: http://llvm.org/PR21167
 ;
diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll
index 7c28ca4860e79..aad998ba2728b 100644
--- a/polly/test/CodeGen/test.ll
+++ b/polly/test/CodeGen/test.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *
 
 ;int bar1();
diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
index d97a632fc382e..1c68389eaeba8 100644
--- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
+++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 
 ; CHECK: polly.merge_new_and_old:
 ; CHECK-NEXT: merge = phi
diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
index 845d106d43b0e..4396c38310dce 100644
--- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
+++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: for.cond:
 ; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ]
diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll
index 4b9d49cb02ec6..dd3f310ef1502 100644
--- a/polly/test/CodeGen/two-scops-in-row.ll
+++ b/polly/test/CodeGen/two-scops-in-row.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; SCALAR: if (
diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll
index 2a3ba8ae45757..354e3cd180107 100644
--- a/polly/test/CodeGen/udiv_expansion_position.ll
+++ b/polly/test/CodeGen/udiv_expansion_position.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify we do not crash when we synthesize code for the udiv in the SCoP.
 ;
diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll
index ad0e6ca7e350b..e08af07e604e8 100644
--- a/polly/test/CodeGen/uninitialized_scalar_memory.ll
+++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Verify we initialize the scalar locations reserved for the incoming phi
 ; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
index e7f4d601edab5..46706804a81b0 100644
--- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
+++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s
 
 ; The loop for.body is a scop with invariant load hoisting, but does not
 ; terminate predictably for ScalarEvolution. The scalar %1 therefore is not
diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll
index d1f4450d086e0..6f2d3dc582db3 100644
--- a/polly/test/CodeGen/variant_load_empty_domain.ll
+++ b/polly/test/CodeGen/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 ;
 ;
 ; void f(int *A) {
diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
index 44f6dbcd34d1d..b342b1cb5aa27 100644
--- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
+++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
 ; CHECK: polly.start
 
 ; int /* pure */ g()
diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll
index de340ef48d16e..0c19eb6aa605a 100644
--- a/polly/test/DeLICM/confused_order.ll
+++ b/polly/test/DeLICM/confused_order.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
 ;
 ; ForwardOptree changes the SCoP and may already map some accesses.
 ; DeLICM must be prepared to encounter implicit reads
diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
index ba42692febab2..66d9ae889e657 100644
--- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
+++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; The domain of bb14 contradicts the SCoP's assumptions. This leads to
 ; 'anything goes' inside the statement since it is never executed,
diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll
index 19cc334f70054..a78a4691bb0d5 100644
--- a/polly/test/DeLICM/load-in-cond-inf-loop.ll
+++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 
 ; When %b is 0, %for.body13 is an infinite loop. In this case the loaded
 ; value %1 is not used anywhere.
diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll
index cc4e0ab387d2a..9a8e5989fdad1 100644
--- a/polly/test/DeLICM/map_memset_zero.ll
+++ b/polly/test/DeLICM/map_memset_zero.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; Check that PHI mapping works even in the presence of a memset whose
 ; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll
index 9e49300381b57..da5f4ec24a47e 100644
--- a/polly/test/DeLICM/nomap_alreadymapped.ll
+++ b/polly/test/DeLICM/nomap_alreadymapped.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll
index 6460dbdb808fb..60955368fe59c 100644
--- a/polly/test/DeLICM/nomap_escaping.ll
+++ b/polly/test/DeLICM/nomap_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll
index 72eea57b8fdf5..9ba8ce2641231 100644
--- a/polly/test/DeLICM/nomap_occupied.ll
+++ b/polly/test/DeLICM/nomap_occupied.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll
index 67bac06f1505f..7a185d336bad3 100644
--- a/polly/test/DeLICM/nomap_readonly.ll
+++ b/polly/test/DeLICM/nomap_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   fsomeval = 21.0 + 21.0;
diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll
index f3fcb0ccd06e4..0ed7f6ee8e239 100644
--- a/polly/test/DeLICM/nomap_spuriouswrite.ll
+++ b/polly/test/DeLICM/nomap_spuriouswrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll
index 0f2943a5b1417..bf851ac342d20 100644
--- a/polly/test/DeLICM/nomap_storagesize.ll
+++ b/polly/test/DeLICM/nomap_storagesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(float *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll
index fc8459a34972c..9fcd52aad743c 100644
--- a/polly/test/DeLICM/nomap_writewrite.ll
+++ b/polly/test/DeLICM/nomap_writewrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll
index d48665bdc29c1..1f7527c841208 100644
--- a/polly/test/DeLICM/outofquota-reverseDomain.ll
+++ b/polly/test/DeLICM/outofquota-reverseDomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; This causes an assertion to fail on out-of-quota after 1000000 operations.
 ; (The error was specific to -polly-delicm-max-ops=1000000 and changes
diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll
index d784655db60f3..64302d9983261 100644
--- a/polly/test/DeLICM/pass_existence.ll
+++ b/polly/test/DeLICM/pass_existence.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -disable-output < %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s
 ;
 ; Simple test for the existence of the DeLICM pass.
 ;
diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll
index 82799e4fd1ab8..2a92503809a24 100644
--- a/polly/test/DeLICM/pr41656.ll
+++ b/polly/test/DeLICM/pr41656.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR41656
 ;
diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll
index 10f8b64c3dd2f..deba8bfcc5daf 100644
--- a/polly/test/DeLICM/pr48783.ll
+++ b/polly/test/DeLICM/pr48783.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR48783
 ;
diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll
index 5d6531f51d570..29b7a3617300b 100644
--- a/polly/test/DeLICM/reduction.ll
+++ b/polly/test/DeLICM/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_constant_selfconflict.ll b/polly/test/DeLICM/reduction_constant_selfconflict.ll
index 223a429d76343..012e0a0794b2b 100644
--- a/polly/test/DeLICM/reduction_constant_selfconflict.ll
+++ b/polly/test/DeLICM/reduction_constant_selfconflict.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate.ll b/polly/test/DeLICM/reduction_looprotate.ll
index b8eefe5e57cf8..341cc091f7e18 100644
--- a/polly/test/DeLICM/reduction_looprotate.ll
+++ b/polly/test/DeLICM/reduction_looprotate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
index 627a4452c3f90..a58eabb4fbd82 100644
--- a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
+++ b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; Verify that delicm can cope with never taken PHI incoming edges.
 ; The edge %body -> %body_phi is never taken, hence the access MemoryKind::PHI,
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
index 1d3a789f7ce07..5a81441cf0eea 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
index 37499cd73020f..d9c5268e631df 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Load (but not store) of A[j] hoisted, reduction only over some iterations.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
index 79a700ff122e2..6a4223f5af655 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Load (but not store) of A[j] hoisted, reduction not written in all iterations.
 ; FIXME: %join is not mapped because the MemoryKind::Value mapping does not
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
index 7e82daa9f80fc..bf4b8018d5526 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Hoisted reduction load (but not the store) without preheader.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_hoisted.ll b/polly/test/DeLICM/reduction_looprotate_hoisted.ll
index 7dc6e0fa9e408..795b94912aa42 100644
--- a/polly/test/DeLICM/reduction_looprotate_hoisted.ll
+++ b/polly/test/DeLICM/reduction_looprotate_hoisted.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A, int* StartPtr) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_licm.ll b/polly/test/DeLICM/reduction_looprotate_licm.ll
index a9c55a8f54087..935f31abced30 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_licm2.ll b/polly/test/DeLICM/reduction_looprotate_licm2.ll
index b98950b71bc85..8b06e7466f20a 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; Use %phi instead of the normal %add; that is, the last iteration will
 ; be ignored such that %phi cannot be written to A[3] in %body.
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
index 4424d904b607d..51bb7291a73ed 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
@@ -1,4 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule \
+; RUN: -polly-delicm-overapproximate-writes=true \
+; RUN: -polly-delicm-compute-known=true -polly-print-delicm \
+; RUN: -disable-output < %s | FileCheck %s
 ;
 ; Make sure delicm works even in the case of two stores that store the same value.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
index 7d20b8d5c7cbf..027df44e86193 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; Register-promoted reduction but without preheader.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_load.ll b/polly/test/DeLICM/reduction_looprotate_load.ll
index e288a86f30719..6aa83ae195031 100644
--- a/polly/test/DeLICM/reduction_looprotate_load.ll
+++ b/polly/test/DeLICM/reduction_looprotate_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A, double* StartPtr) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
index 4582f0a36eb5c..4ea3fa53a339a 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
index 7df2885e01339..2e7abe444ad65 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
index a1bd5d3f90fe7..60afdeb5fc97e 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all, such that A[j] is also not written to.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
index 8329a85ecf13b..e63b457de92db 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all, such that A[j] is also not accessed.
diff --git a/polly/test/DeLICM/reduction_looprotate_readonly.ll b/polly/test/DeLICM/reduction_looprotate_readonly.ll
index 5227f42ae4824..a9535467b3bde 100644
--- a/polly/test/DeLICM/reduction_looprotate_readonly.ll
+++ b/polly/test/DeLICM/reduction_looprotate_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A, double Start) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
index 77d823c8ef6d5..3d486910c8612 100644
--- a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
+++ b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_undef.ll b/polly/test/DeLICM/reduction_looprotate_undef.ll
index f70df6075c2d3..8c0544ed77852 100644
--- a/polly/test/DeLICM/reduction_looprotate_undef.ll
+++ b/polly/test/DeLICM/reduction_looprotate_undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_overapproximate.ll b/polly/test/DeLICM/reduction_overapproximate.ll
index d6cbb70a84a4a..2d33d3a0ece2a 100644
--- a/polly/test/DeLICM/reduction_overapproximate.ll
+++ b/polly/test/DeLICM/reduction_overapproximate.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL
 ;
 ; void func(double *A) {
 ;   for (int j = -1; j < 3; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_preheader.ll b/polly/test/DeLICM/reduction_preheader.ll
index f3ce58b1bc954..c6e3643797c04 100644
--- a/polly/test/DeLICM/reduction_preheader.ll
+++ b/polly/test/DeLICM/reduction_preheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll
index 542cec71ab855..97826f603e5d4 100644
--- a/polly/test/DeLICM/reduction_unrelatedunusual.ll
+++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; Map %add and %phi to A[j].
 ; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll
index d56b237aa71d9..4460620852a85 100644
--- a/polly/test/DeLICM/reject_loadafterstore.ll
+++ b/polly/test/DeLICM/reject_loadafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll
index 9b7f8e5f97af3..9bc6bf1f23733 100644
--- a/polly/test/DeLICM/reject_outofquota.ll
+++ b/polly/test/DeLICM/reject_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps;delicm>' -polly-print-deps -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll
index 0fea4d7bb3960..ddd13dad2ed31 100644
--- a/polly/test/DeLICM/reject_storeafterstore.ll
+++ b/polly/test/DeLICM/reject_storeafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll
index 0b75c16495c5c..c987156b51cd1 100644
--- a/polly/test/DeLICM/reject_storeinsubregion.ll
+++ b/polly/test/DeLICM/reject_storeinsubregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll
index 311a7351c955b..342888c6654f4 100644
--- a/polly/test/DeLICM/reject_unusualstore.ll
+++ b/polly/test/DeLICM/reject_unusualstore.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
 ; REQUIRES: asserts
 ;
 ; void func(double *A) {
diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll
index 14de2b9d0bf84..0d30791cd94e7 100644
--- a/polly/test/DeLICM/skip_maywrite.ll
+++ b/polly/test/DeLICM/skip_maywrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll
index a213a91343f3d..a7c79f7524630 100644
--- a/polly/test/DeLICM/skip_multiaccess.ll
+++ b/polly/test/DeLICM/skip_multiaccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR34485
 ; llvm.org/PR34989
diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll
index 3a2dede210083..8e265e19aefea 100644
--- a/polly/test/DeLICM/skip_notinloop.ll
+++ b/polly/test/DeLICM/skip_notinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   double phi = 0.0;
diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll
index a0ed9f76a8ca2..2cf13afe11cdf 100644
--- a/polly/test/DeLICM/skip_scalaraccess.ll
+++ b/polly/test/DeLICM/skip_scalaraccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll
index f1e47075e2f74..f3bf07bb40d83 100644
--- a/polly/test/DeadCodeElimination/chained_iterations.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll
index 6ecc07c0f7d21..52f034f0e56ca 100644
--- a/polly/test/DeadCodeElimination/chained_iterations_2.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll
index b43142be2a5c8..e54df42ed1db0 100644
--- a/polly/test/DeadCodeElimination/computeout.ll
+++ b/polly/test/DeadCodeElimination/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
index 85eea91f99207..c102f60abb659 100644
--- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
+++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
index 21b7c5cf9583b..36f55476fed23 100644
--- a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
+++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll
index 86cabe6501393..ef528b4124c66 100644
--- a/polly/test/DeadCodeElimination/non-affine.ll
+++ b/polly/test/DeadCodeElimination/non-affine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ;
diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll
index 507d690144e01..01d34e95629ba 100644
--- a/polly/test/DeadCodeElimination/null_schedule.ll
+++ b/polly/test/DeadCodeElimination/null_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ; A[0] = 1;
 ;
diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll
index 3fdc4008f8474..c2a3456b3dc80 100644
--- a/polly/test/DependenceInfo/computeout.ll
+++ b/polly/test/DependenceInfo/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll
index 69274f11f567f..f89791f42f9db 100644
--- a/polly/test/DependenceInfo/different_schedule_dimensions.ll
+++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \
+; RUN: -disable-output < %s | FileCheck %s
 
 ; CHECK: RAW dependences:
 ; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] }
diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll b/polly/test/DependenceInfo/do_pluto_matmult.ll
index 2a0027bbc034b..b88cf9bf5475c 100644
--- a/polly/test/DependenceInfo/do_pluto_matmult.ll
+++ b/polly/test/DependenceInfo/do_pluto_matmult.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll
index 06a196822c832..5abbf48136891 100644
--- a/polly/test/DependenceInfo/fine_grain_dep_0.ll
+++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
 
 ; REF: RAW dependences:
 ; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N }
diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
index 9875257694331..677323495476b 100644
--- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
+++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll
index c9473e614e362..cde3102dc3dc9 100644
--- a/polly/test/DependenceInfo/infeasible_context.ll
+++ b/polly/test/DependenceInfo/infeasible_context.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-SCOP
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-detect -polly-print-deps -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-DEPS
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=FUNC-SCOP
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=FUNC-DEPS
 ;
 ; FUNC-SCOP-NOT: Statement
 ; FUNC-DEPS-NOT: RAW dependences
diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
index 92e6cb89b2a27..392a34769cddb 100644
--- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
+++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; Verify that the presence of a may-write (S1) between a read (S0) and a
 ; must-write (S2) does not block the generation of RAW dependences. This makes
diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
index b14759725dde0..ae5fd3beed399 100644
--- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
+++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll
index 45789088e57e4..7722ee974c3fa 100644
--- a/polly/test/DependenceInfo/reduction_complex_location.ll
+++ b/polly/test/DependenceInfo/reduction_complex_location.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 7923975118bb9..840d1f32dca39 100644
--- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; This loopnest contains a reduction which imposes the same dependences as the
 ; accesses to the array A. We need to ensure we keep the dependences of A.
diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
index fdcd5f311800d..56d84a9aec6d6 100644
--- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
@@ -1,7 +1,7 @@
 ; Test that the reduction dependences are always initialised, even in a case
 ; where we have no reduction. If this object is NULL, then isl operations on
 ; it will fail.
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
index 13675ada39b0e..3b4bd9ef04b5a 100644
--- a/polly/test/DependenceInfo/reduction_indirect_access.ll
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK: [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
index e6ce425719ca9..76c7fc64ae89c 100644
--- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
index 820371937a582..02b814a0d7c04 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
@@ -1,6 +1,6 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
 ;
 ; Verify that only the inner reduction like accesses cause reduction dependences
 ;
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
index 9792f791c6989..91bd35deebd06 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
index 9bde285c64516..040d513782392 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
index ac3adb9065462..527a8cfc3556e 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; Verify we do not have dependences between the if and the else clause
 ;
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
index 16ca85bff9502..fb5fd96a2e426 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ;
 ; These are the important RAW dependences, as they need to originate/end in only one iteration:
diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
index de506a39485cc..3ec3920268b49 100644
--- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; FIXME: Change the comment once we allow different pointers
 ; The statement is "almost" reduction like but should not yield any reduction dependences
diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
index fbf1409a1ba30..23bd8ef25bd7a 100644
--- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
+++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll
index 0d66f885cd42d..0e0f71737ffd3 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
index 81235d6cf02e4..cafa319e2cc7b 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; We have privatization dependences from a textually later statement to a
 ; textually earlier one, but the dependences still go forward in time.
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll index 6b48ab5afd155..d86da92fbcab8 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll index 1fef004c4c47a..d84c04fc309b0 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll index f40a7c07a3ba4..592c7238c3c59 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 } diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll index d881a99adc226..7ce9d37d395bb 100644 --- a/polly/test/DependenceInfo/reduction_sequence.ll +++ b/polly/test/DependenceInfo/reduction_sequence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; void manyreductions(long *A) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll index b811d1593ab02..d13d14ecaad92 100644 --- a/polly/test/DependenceInfo/reduction_simple_iv.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll index 0a5d36f9b9f79..4c97fbb1aacb7 100644 --- 
a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s ; ; REQUIRES: asserts ; diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll index 90f9d76ef57b2..804005cf72a72 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll index 2b194bbb51988..9596827b4cbbf 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 } diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll index 70d5bdf64059d..d67683d11a4b3 100644 --- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll +++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll index 023c2d4f29f37..6ae7200303321 100644 --- a/polly/test/DependenceInfo/sequential_loops.ll +++ b/polly/test/DependenceInfo/sequential_loops.ll @@ -1,6 +1,6 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based 
-polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS ; VALUE: RAW dependences: ; VALUE-NEXT: { } diff --git a/polly/test/FlattenSchedule/gemm.ll b/polly/test/FlattenSchedule/gemm.ll index 11dc40599bb0e..b20293bd315a3 100644 --- a/polly/test/FlattenSchedule/gemm.ll +++ b/polly/test/FlattenSchedule/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten>' -polly-print-flatten-schedule -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-flatten-schedule -disable-output < %s | FileCheck %s ; ; dgemm kernel ; C := alpha*A*B + beta*C diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll index 3dfe3fa0aa8e6..6c81fb12e8cdc 100644 --- a/polly/test/ForwardOpTree/atax.ll +++ b/polly/test/ForwardOpTree/atax.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll index ec8869da3ae57..b9081f3734044 100644 --- a/polly/test/ForwardOpTree/changed-kind.ll +++ b/polly/test/ForwardOpTree/changed-kind.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; In the code below, %0 is known to be equal to the content of @c (constant 0). ; Thus, in order to save a scalar dependency, forward-optree replaces diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll index de47bc4df0076..767a580dccf95 100644 --- a/polly/test/ForwardOpTree/forward_from_region.ll +++ b/polly/test/ForwardOpTree/forward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions from region statements. 
; diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll index 39f99545b01ac..5d0b0a884b761 100644 --- a/polly/test/ForwardOpTree/forward_hoisted.ll +++ b/polly/test/ForwardOpTree/forward_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify). ; This involves making the load-hoisted %val1 available in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll index a9f5d3d85ac0a..50a9b07b8a05b 100644 --- a/polly/test/ForwardOpTree/forward_instruction.ll +++ b/polly/test/ForwardOpTree/forward_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll index 2279a89cfaeb7..ef71b11dc5716 100644 --- a/polly/test/ForwardOpTree/forward_into_region.ll +++ b/polly/test/ForwardOpTree/forward_into_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions to region statements. ; diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll index f7901e1ccf8fd..1c585446ae63a 100644 --- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll +++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; define void @foo(ptr %A, i32 %p, ptr %B) { diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll index 860e603ef47d2..0bba41833fb19 100644 --- a/polly/test/ForwardOpTree/forward_load.ll +++ b/polly/test/ForwardOpTree/forward_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load.
; diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll index 24b008cfae384..364bf3ef37133 100644 --- a/polly/test/ForwardOpTree/forward_load_differentarray.ll +++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; To forward %val, B[j] cannot be reused in bodyC because it is overwritten ; between. Verify that instead the alternative C[j] is used. diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll index 522e803b2d0a0..4c30c7f8da56f 100644 --- a/polly/test/ForwardOpTree/forward_load_double_write.ll +++ b/polly/test/ForwardOpTree/forward_load_double_write.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load even in case two writes of identical values are in ; one scop statement. diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll index 5c64221d882b9..1494e872a8942 100644 --- a/polly/test/ForwardOpTree/forward_load_fromloop.ll +++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward the LoadInst %val into %bodyB. %val is executed multiple times, ; so we must get the last loaded value. diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll index 5b06c357f02ba..51ce94d267277 100644 --- a/polly/test/ForwardOpTree/forward_load_indirect.ll +++ b/polly/test/ForwardOpTree/forward_load_indirect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward an operand tree consisting of a speculatable instruction (%add) ; and a load (%val). diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll index b889783d531e6..bd2cad411eccf 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_after.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access.
; diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll index c8f0e0e5814fb..3e89dea37775c 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_before.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access. ; diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll index df57bf70cc53b..7526a8313945d 100644 --- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll +++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines ; ; %val1 is used three times: Twice by its own operand tree of %val2 and once ; more by the store in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll index ba84a1a16748f..daf289d8b0da1 100644 --- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll +++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; The non-analyzable store to C[0] is unrelated and can be ignored. diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll index c763af4269c89..1457aa96e2de7 100644 --- a/polly/test/ForwardOpTree/forward_phi_load.ll +++ b/polly/test/ForwardOpTree/forward_phi_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. 
; diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll index 69c7f10be4e56..646121c4efeff 100644 --- a/polly/test/ForwardOpTree/forward_readonly.ll +++ b/polly/test/ForwardOpTree/forward_readonly.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll index e39e7b51dc689..d8ad31782ecb9 100644 --- a/polly/test/ForwardOpTree/forward_reusue.ll +++ b/polly/test/ForwardOpTree/forward_reusue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move operand tree without duplicating values used multiple times. ; diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll index 8cd6e2446ff93..17cb8b395eb30 100644 --- a/polly/test/ForwardOpTree/forward_store.ll +++ b/polly/test/ForwardOpTree/forward_store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll index f70965f3c5d1b..57b68180bb121 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Copy %val to bodyB, assuming the exit value of %i. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll index c95c45856ac36..b4828e4c2c423 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test support for (synthesizable) induction variables.
; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll index 14fb8d8dcc0ab..3228bb60d2ca2 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Synthesizable values defined outside of a loop can be used ; inside the loop. diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll index 7b55d9e0cf9b2..aacf1358648f5 100644 --- a/polly/test/ForwardOpTree/forward_transitive.ll +++ b/polly/test/ForwardOpTree/forward_transitive.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll index 3bc504d88c0eb..cb035bb749c7b 100644 --- a/polly/test/ForwardOpTree/jacobi-1d.ll +++ b/polly/test/ForwardOpTree/jacobi-1d.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll index 0729241c3f7d9..bd5864c25f543 100644 --- a/polly/test/ForwardOpTree/noforward_from_region.ll +++ b/polly/test/ForwardOpTree/noforward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Ensure we do not move instructions from region statements in case the ; instruction to move loads from an array which is also written to from diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll index d33ef99ae6bed..5474e740de800 100644 --- a/polly/test/ForwardOpTree/noforward_load_conditional.ll +++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; B[j] is overwritten by at least one statement between the ; definition of %val and its use. Hence, it cannot be forwarded. 
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll index e7deb381de87a..697c940be4fdd 100644 --- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll +++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Cannot rematerialize %val from B[0] at bodyC because B[0] has been ; overwritten in bodyB. diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll index 5e30cf88de4cf..306bb8d7558d1 100644 --- a/polly/test/ForwardOpTree/noforward_outofquota.ll +++ b/polly/test/ForwardOpTree/noforward_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS ; REQUIRES: asserts ; ; for (int j = 0; j < n; j += 1) { diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll index f95bb77f70b67..edb5d34801cc5 100644 --- a/polly/test/ForwardOpTree/noforward_partial.ll +++ b/polly/test/ForwardOpTree/noforward_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Not the entire operand tree can be forwarded, ; some scalar dependencies would remain. diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll index 025fe64724151..755abad4336ef 100644 --- a/polly/test/ForwardOpTree/noforward_phi.ll +++ b/polly/test/ForwardOpTree/noforward_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not move PHI nodes. ; diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll index 8b30137858243..be7e82f726331 100644 --- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll +++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Contains a self-referencing PHINode that would require a ; transitive closure to handle. 
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll index 179b02a259025..c01b72a1c1420 100644 --- a/polly/test/ForwardOpTree/noforward_sideffects.ll +++ b/polly/test/ForwardOpTree/noforward_sideffects.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not forward instructions with side-effects (here: function call). ; diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll index 6baec6d9e1c6c..776d848072a23 100644 --- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll +++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not try to forward %i.trunc, it is not synthesizable in %body. ; diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll index 95df49a5c061a..ee3e32698dd02 100644 --- a/polly/test/ForwardOpTree/out-of-quota1.ll +++ b/polly/test/ForwardOpTree/out-of-quota1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s ; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota. 
diff --git a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll index a5102b3557f0c..ec1ccdce94508 100644 --- a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s ; ; void jd(int *A) { ; CHECK: #pragma omp parallel for diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll index d086b59f97a5a..9c00690605408 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll index 49a6b0531de56..356762a2ae5b9 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; int A[1024][1024]; ; void bar(int n) { diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll index d2d7917b08528..066fc39def6ac 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll index c03189a211256..77dd55cb7605e 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll index 6829211cc76b9..b61ebc9379b7f 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll index 7199a337d8a4f..5c92a91681867 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll index 41d35bfdb3631..352d879199675 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll index 356269cefad36..81c29536010b6 100644 --- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll +++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ +; RUN: | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll index 039c5f74fabfe..904f55dc32ce4 100644 --- a/polly/test/IstAstInfo/alias_simple_1.ll +++ b/polly/test/IstAstInfo/alias_simple_1.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly 
'-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll index 1783a04f02be9..5fae579995b23 100644 --- a/polly/test/IstAstInfo/alias_simple_2.ll +++ b/polly/test/IstAstInfo/alias_simple_2.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; ; int A[1024], B[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll index 8d507fb82cb2d..8599c29934744 100644 --- a/polly/test/IstAstInfo/alias_simple_3.ll +++ b/polly/test/IstAstInfo/alias_simple_3.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= 
-disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; float B[1024]; diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll index 01b5372917358..dc21dc1f96a48 100644 --- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll +++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output -polly-invariant-load-hoisting < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; CHECK: if (1 && 1 && (&MemRef_X[1] <= &MemRef_BaseA[0] || &MemRef_BaseA[1024] <= &MemRef_X[0]) && (&MemRef_X[1] <= &MemRef_BaseB[0] || &MemRef_BaseB[1024] <= &MemRef_X[0])) diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll index 3835c23fecddb..8d4adfa405f07 100644 --- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll index 71bac9a2bb141..be37b27b6e375 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s ; ; 
void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll index e5ece1f57a85e..15550583340db 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/dependence_distance_constant.ll b/polly/test/IstAstInfo/dependence_distance_constant.ll index 43b13eef9a95b..9b7fb93f2f676 100644 --- a/polly/test/IstAstInfo/dependence_distance_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll index 35a503ce7eb8d..d69cc3f9fc3f8 100644 --- a/polly/test/IstAstInfo/dependence_distance_minimal.ll +++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; The minimal dependence distance of the innermost loop should be 1 instead of 250. 
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
index a7de5c4876385..bc21e9e07ad89 100644
--- a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
+++ b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict B, int N) {
 ; CHECK: #pragma minimal dependence distance: 5
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric.ll b/polly/test/IstAstInfo/dependence_distance_parametric.ll
index fa05e4c889031..fa569a8386b86 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N, int c) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
index 73f74b3bce0b1..7f280e0c542ca 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N, int c, int v) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_varying.ll b/polly/test/IstAstInfo/dependence_distance_varying.ll
index e908954536600..d609c2f210f8d 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; CHECK: #pragma minimal dependence distance: -(N % 2) + 2
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
index 1668fc0515441..8ed3220353c1b 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-canonicalize,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict sum) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
index 0d0aa8bea31d8..73768e9c308a4 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D,
 ;        int *restrict E, int N) {
diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
index 2ed94e59e8087..e2cf0bd9c0df2 100644
--- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
+++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s

 ; CHECK: {
 ; CHECK-NEXT: if (p <= -1 || p >= 1)
diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll
index a285a8f032f5e..98e8d2db959f8 100644
--- a/polly/test/IstAstInfo/non_affine_access.ll
+++ b/polly/test/IstAstInfo/non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; void non_affine_access(float A[]) {
 ; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
index 3fefc74efbef0..697b6ca50d444 100644
--- a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
 ; void f(int N, int M, int P, int sum[P][M]) {
diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
index 41bd178c73c2a..c20a7d6db13c9 100644
--- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
 ; void f(int N, int M, int *sum) {
diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 5aa8a0c244423..e6092f0b068f8 100644
--- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; This loopnest contains a reduction which imposes the same dependences as the
 ; accesses to the array A. We need to ensure we do __not__ parallelize anything
diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
index 91f7c9d9601bc..14de70f9357c3 100644
--- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
+++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
diff --git a/polly/test/IstAstInfo/reduction_in_one_dimension.ll b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
index d0173bcd978ca..797115b6f8d70 100644
--- a/polly/test/IstAstInfo/reduction_in_one_dimension.ll
+++ b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that we won't privatize anything in the outer dimension
 ;
diff --git a/polly/test/IstAstInfo/reduction_loop_reversal.ll b/polly/test/IstAstInfo/reduction_loop_reversal.ll
index d010e26f739a6..d30119787d8e0 100644
--- a/polly/test/IstAstInfo/reduction_loop_reversal.ll
+++ b/polly/test/IstAstInfo/reduction_loop_reversal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT: #pragma simd{{\s*$}}
 ; CHECK: #pragma simd reduction
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
index 7f78badfcb93c..15fca884c2b63 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
 ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
index 42e9c3b19eb1b..44e9aa4d1e569 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction
 ; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
index 8bdd5299986eb..c39ffa591484d 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
 ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
index 4811069e4f399..266753555cab1 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1)
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
index 4f5ac24a0b005..d7f9029fd347a 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
index 472a04847ec95..f18060a2e20a8 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
index 2cc911d78234b..8e2a590c5f57c 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
index 1b2d0eb75c12c..b889db4819cd5 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that only the outer dimension needs privatization
 ;
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
index 884cea7918031..2a8fd7a4f670e 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
index 013a7d4f3ad27..25f2fa597e34e 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
index 2dc6d8680b36a..0d6be9a9da9bf 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
index dcd75945d25a8..8b537513cc8d7 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll
index 67fc4b74571da..44d3534f651ce 100644
--- a/polly/test/IstAstInfo/run-time-condition.ll
+++ b/polly/test/IstAstInfo/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s

 ; for (i = 0; i < 1024; i++)
 ; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
index d674f429c0d48..aef509a865b6a 100644
--- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
+++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
 ;
 ; Verify we do not simplify the runtime check to "true" due to the domain
 ; constraints as the test contains an error block that influenced the domains
diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll
index 73a7c596cea0b..488cd180b899a 100644
--- a/polly/test/IstAstInfo/simple-run-time-condition.ll
+++ b/polly/test/IstAstInfo/simple-run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll
index f546972fb370c..afe6179188c01 100644
--- a/polly/test/IstAstInfo/single_loop_strip_mine.ll
+++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR

 ; for (i = 0; i < 1024; i++)
 ; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
index c9ae9e8f4e52e..f614f90fc3fc9 100644
--- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *

 ;#include "limits.h"
diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
index 45227160e8699..e91ea13278692 100644
--- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *

 ;#include "limits.h"
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
index 28b6a7ca12799..49a962592bb9d 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: expecting other token
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
index f19a632815795..749b962b260f5 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Statement from JScop file has no key name 'accesses' for index 1.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
index 77b9acfbb0989..1d97e3ebca625 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
index 0a06ff671c298..f4b739398f9f6 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of indices and the number of statements differ.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
index 35b7af098ae42..1f5cda3518a2f 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Memory access number 0 has no key name 'relation' for statement number 1.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
index 109665a85c607..0c750849b51eb 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key name 'statements'.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
index f345d1c31796e..d8c9c3f4ab2ea 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file contains access function with undeclared ScopArrayInfo
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
index a66d5c8c69b55..f8d7cb8c1453e 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file changes the number of parameter dimensions.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
index ae0b4edffb5fc..6e13a5e413d76 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has not a valid type.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
index 6c434e15a38d2..7f6578776e0bd 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define Ni 1056
 ; #define Nj 1056
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
index b004c4725176a..e698bdc488c2c 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'name'.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
index 5f62a457f63eb..f130b6556e3e5 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'sizes'.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
index 029fde10f5a4a..68d2e50c6730d 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'type'.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
index 9ac371b655146..94c77dc2a0138 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key named 'context'.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
index 82afcd95c871f..c20d5c02d662e 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The isl_set is not a parameter set.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
index 0308452c6f955..92f4d61212e93 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: unexpected isl_token
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
index debb9bc604110..89668d8d573b1 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
index 6eee0056ba0b5..efe15c14ce90d 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Statement 0 has no 'schedule' key.
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
index 59feb0085e6de..db516f6d7d335 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: expecting other token
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
index 78d5243d34e00..b93c984d7d9dd 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key name 'statements'.
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
index 877547c8f317f..3fa14c64cd639 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of indices and the number of statements differ.
 ;
diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
index 9f999204f59bf..1d81ff7ef2dc8 100644
--- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
+++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the expansion of an array with load after store in a same statement is not done.
 ;
diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll
index 1a733c113626d..57017381c661a 100644
--- a/polly/test/MaximalStaticExpansion/read_from_original.ll
+++ b/polly/test/MaximalStaticExpansion/read_from_original.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that Polly detects problems and does not expand the array
 ;
diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll
index a7aa162aa83da..7e33de17a1749 100644
--- a/polly/test/MaximalStaticExpansion/too_many_writes.ll
+++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that Polly detects problems and does not expand the array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
index 06e08c43e3492..355fc02600d54 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
index 076f47143dbcc..930539547cc97 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI.
 ; tmp_06_phi is not expanded because it need copy in.
diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll
index 2b040f3f1f4e3..a055e50225e91 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
index f863c0e1d6edf..77338c9aac200 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
index a823bdb4e7682..9cfa5536072b7 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
index 0898f99c896d4..63e4d48046275 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::PHI
 ; tmp_04 is not expanded because it need copy-in.
diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
index 2a332ba7ce77b..87bd57abab8d1 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::PHI
 ; tmp_05 and tmp2_06 are not expanded because they need copy-in.
diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
index 77f20bb163a8b..cc28a78c38671 100644
--- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Value
 ;
diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
index b4524c21a35ee..9cc2aecf002dd 100644
--- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
+++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<prune>' -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
 ; REQUIRES: asserts
 ;
 ; Skip this SCoP for having scalar dependencies between all statements,
diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
index c8c006c94d1d4..38facb1688c46 100644
--- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
+++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"

 define void @sdbout_label() nounwind {
diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
index 23033faa380af..835986049899b 100644
--- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
+++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

 ; Check that we handle statements with an empty iteration domain correctly.
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
index fdaed3c543673..5e4ce8225a236 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
index 65d495722c2bd..de4c387a1d879 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
index 06d55f46a977f..91bd549c3c7e4 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
index 0af703ccf5ffe..8b69d9e12c0fe 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
index ca6840b900e7f..49d1124740340 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
index f96e4baba71eb..a449a2fda9ba3 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
index 229d13aaf1a4d..798e9b9a7c14f 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
index 9bc9a25ac588e..4d0ccc988a5cc 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 ; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
index 5b0cefbe686f6..bf470b91a7022 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
index 2225f05f6717d..b0f75dd50ef83 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Check that the disable_nonforced metadata is honored; optimization
 ; heuristics/rescheduling must not be applied.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
index 4add219214aa3..900360d7533f8 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
 ;
 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
index d59f9e58e2785..d45b62433dbbc 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
 ;
 ; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
index a1caaf5db5a61..d835e66693fb4 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
 ;
 ; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
index b05710203fd37..a5781a7f60365 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Override unroll metadata with llvm.loop.unroll.disable.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
index 8992bc942646e..cccf136a1c4ac 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Apply two loop transformations. First partial, then full unrolling.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
index 7bea96f791a80..4d499078a4364 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Full unroll of a loop with 5 iterations.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
index 34a6f486e646c..d67472ab86936 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Unrolling with heuristic factor.
 ; Currently not supported and expected to be handled by LLVM's unroll pass.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
index ce2281372a20d..90101b4fde390 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
 ;
 ; Partial unroll by a factor of 4.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index f6810ba6c48fb..4cfa3fb911515 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; Partial unroll by a factor of 4.
 ;
@@ -54,6 +54,6 @@ return:
 
 ; AST-NEXT: for (int c0 = 0; c0 < n; c0 += 4) {
 
-; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit, !llvm.loop ![[LOOPID:[0-9]+]]
+; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.exiting, !llvm.loop ![[LOOPID:[0-9]+]]
 ; CODEGEN: ![[LOOPID]] = distinct !{![[LOOPID]], ![[LOOPNAME:[0-9]+]]}
 ; CODEGEN: ![[LOOPNAME]] = !{!"llvm.loop.id", !"This-is-the-unrolled-loop"}
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index b03d475dd42ee..3f6f50e34775d 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-custom<opt-isl;ast;codegen>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
 ;
 ; Check that there are no nested #pragma omp parallel for inside a
 ; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index 6f34f4efc0a6d..a3286b481ffb3 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 4be0b948d09a0..928ee858ae6d2 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,4 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=1 -polly-target-vector-register-bitwidth=4096 -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=1 \
+; RUN: -polly-target-vector-register-bitwidth=4096 \
+; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
 ;
 ; /* Test that Polly does not crash due to configurations that can lead to
 ; incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index 548a8aa94afbf..b533cb870bdcb 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
 ;
 ; llvm.org/PR46578
 ;
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 6de5e3a606aa3..3dd579ed736f7 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK-NEXT: #pragma known-parallel
 ; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index 6256adfcd6917..3a2c566d19d3d 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ; CHECK: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 51e02594aa880..0dbdeff4742b9 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
 ; CHECK: for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index 79deedc7cd830..8f270b94617fe 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,4 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-optimized-scops -polly-target-vector-register-bitwidth=256 -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-optimized-scops \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -disable-output < %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index e3ae1a02bd347..de1c815f92350 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; /* _PB_NK % Kc != 0 */
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index f37f1e5119a9f..a097d4a43cfd2 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; void jacobi1d(long T, long N, float *A, float *B) {
 ; long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index e0a7a63cda80d..7c1af80c9ffae 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
 
 ; By skewing, the diagonal can be made parallel. ISL does this when
 ; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 84f1ca0dba652..a19b93d9915dd 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,4 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true '-passes=polly-custom<optree;delicm;simplify;opt-isl>' -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s
 ; REQUIRES: asserts
 
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 72fb4f1b4e41c..4ef0605a0ba75 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm;simplify-1;opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the tensor contraction pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 933b2d4d258e7..09118e252233b 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,7 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=false -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index 03e23038877e5..b771d1f87537e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,4 +1,16 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl>' -polly-import-jscop-postfix=transformed -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
+; RUN: -polly-import-jscop-postfix=transformed \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -debug \
+; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index 4e174e3c9723d..238f6dd798e68 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -passes=polly-opt-isl -disable-output < %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index c3d8b6ed3fee5..0e4540eb7ba3c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=2 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=128 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=2 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=128 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index 3705c3fd27ed9..9678ad83ff048 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,4 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl;ast;codegen>' -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-import-jscop-postfix=transformed -S < %s \
+; RUN: | FileCheck %s
 ;
 ; Check that we disable the Loop Vectorizer.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index 7ada105828b27..e74884d59c311 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug-only=polly-opt-isl -disable-output -polly-tc-opt=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -debug-only=polly-opt-isl -disable-output \
+; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index 6647380b2d070..9c99a090b69e7 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index fba77d5e4f82d..8e14035ce8629 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index 488436064ae83..4f562c306f96a 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index c7a5d475bef31..32ded897d4ff9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 8; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 1dba8bece8072..f0c0177da84b0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 3656a9457cef2..155177bdfade0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 16; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index bd0cb054957af..3d21ac3859a7e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 6e6788be2973f..00a4bf885aef8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index 82356ae0a398d..bfe5c5249a3a8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
+; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index ea28bb8c0bdb6..a2e1ced3e6320 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index f80d63cd4d66c..9844d377e609d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,6 +1,19 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=0 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-size=0 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
 
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index 100b17e2ccd21..250641d57bac5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,5 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-tc-opt=true -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-tc-opt=true -disable-output < %s | \
+; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ; C := A * B + C
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index 050af1b2377d3..ad2c195ba1e8e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index ba1ddfef6a4e4..1d3cdbdbfdd85 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index e50b3a0a3f2b0..59eaa4a0928e9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index 3f57fe8cf6c73..2544d502a2dc5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index b87ed4fb1ec3c..85c143562f5af 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,6 +1,14 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -passes=polly-opt-isl -disable-output < %s
 ;
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 98c1db6d36fbe..64285891a16c7 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;opt-isl>' -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index 4784dc88cd307..1c6d289744e39 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 6d1592c4ba8fa..1ff20d165ce5e 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
index f346e5365b198..0bc3c2cf642e8 100644
--- a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl>' -polly-debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) {
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index 3fd4907909419..e1d768b351d7d 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
 
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index 1ee8a90473bd3..1e1359e3ecc6a 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly-custom<optree;delicm;opt-isl>' -polly-schedule-computeout=10000 -debug-only=polly-opt-isl < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Bailout if the computations of schedule compute exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index bb705ac6abf38..84eb59341d273 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index e3d7c24ebef77..50a46d66176ea 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ;
 ; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index bb472b9c3763f..e59a31665d77b 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
 
 ; isl_schedule_node_band_sink may sink into multiple children.
 ; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index d83c822371b6e..cee1c06cf7aa0 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index 63c9addd0b6e1..5506b3c626cfd 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index ea8a7688f3d25..5f43ec1856a7f 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index df68289ff7352..e853dfcc64485 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/base_pointer.ll b/polly/test/ScopDetect/base_pointer.ll
index 0f0e219bd90d1..e500f9bc20bc6 100644
--- a/polly/test/ScopDetect/base_pointer.ll
+++ b/polly/test/ScopDetect/base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= -polly-invariant-load-hoisting=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -disable-basic-aa -polly-invariant-load-hoisting=true -polly-print-detect -disable-output < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index b00ec77679063..eeb9e11f812c3 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts;import-jscop>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This violated an assertion in setNewAccessRelation that assumed base pointers
 ; to be load-hoisted. Without this assertion, it codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index 1cd04b639fc99..16976e6313275 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;import-jscop>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
 ;
 ; Polly codegen used to generate invalid code (referring to %ptr from the
 ; original region) when regeneration of the access function is necessary.
@@ -35,5 +35,3 @@ exit:
 
 ; CHECK-NOT: Valid Region for Scop
-; CHECK: Detected Scops in Function base_pointer_is_inst_inside_invariant_1
-; CHECK-NOT: Valid Region for Scop
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index 4200339a04a13..4182974693678 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
 ; REQUIRES: asserts
 
 ; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f5263e4e4c40a..f451bccec706f 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
 
 ;CHECK: Function: test_init_chpl
 ;CHECK-NEXT: Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index d7605c36d449c..fe3922174c07c 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index c3a2ad4791ba7..4cac173932a6f 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index e896e18589e94..7d7476471bb6e 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/detect-full-functions.ll b/polly/test/ScopDetect/detect-full-functions.ll
index adad0e89ffa42..178ef32827cab 100644
--- a/polly/test/ScopDetect/detect-full-functions.ll
+++ b/polly/test/ScopDetect/detect-full-functions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
 
 ; Verify if a simple function with basic block not part of loop doesn't crash with polly-process-unprofitable=false and polly-detect-full-functions flags.
 
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll
index 0f670ca230824..efc732c50e177 100644
--- a/polly/test/ScopDetect/dom-tree-crash.ll
+++ b/polly/test/ScopDetect/dom-tree-crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Detected Scops in Function foo
 
diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll
index de1f52813475a..d14bf8a23a166 100644
--- a/polly/test/ScopDetect/dot-scops-npm.ll
+++ b/polly/test/ScopDetect/dot-scops-npm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s
 ; RUN: FileCheck %s -input-file=scops.func_npm.dot
 ;
 ; Check that the ScopPrinter does not crash.
diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll
index a719d21300b15..63163b23617cf 100644
--- a/polly/test/ScopDetect/dot-scops.ll
+++ b/polly/test/ScopDetect/dot-scops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s
 ;
 ; Check that the ScopPrinter does not crash.
 ; ScopPrinter needs the ScopDetection pass, which should depend on
diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll
index 0e82e37d10095..20d02b1c1ae0b 100644
--- a/polly/test/ScopDetect/error-block-always-executed.ll
+++ b/polly/test/ScopDetect/error-block-always-executed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
index 338fe20679bcf..6c66f6df14af5 100644
--- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll
+++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll
index 85f248da9be18..6ba7698a972bb 100644
--- a/polly/test/ScopDetect/error-block-unreachable.ll
+++ b/polly/test/ScopDetect/error-block-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
 
 ; Verify that the scop detection does not crash on inputs with unreachable
 ; blocks. Earlier we crashed when detecting error blocks.
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll
index 43fdda8321cbe..a5c9626d28361 100644
--- a/polly/test/ScopDetect/expand-region-correctly-2.ll
+++ b/polly/test/ScopDetect/expand-region-correctly-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
 ;
diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll
index b4caac4478d1d..a8c90c08fde0c 100644
--- a/polly/test/ScopDetect/expand-region-correctly.ll
+++ b/polly/test/ScopDetect/expand-region-correctly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
 
diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll
index ef1c66686251a..a75e705995a75 100644
--- a/polly/test/ScopDetect/ignore_func_flag_regex.ll
+++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-polly-ignore-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-ignore-func` works with regexes.
 ;
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
index a6f7079f68407..f6d6cfab0eede 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
index be76e0b138933..16d47619b0ff2 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll
index e45e4fb016155..3fbc4d65bbe20 100644
--- a/polly/test/ScopDetect/indvars.ll
+++ b/polly/test/ScopDetect/indvars.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll
index 43fa4ca619ed7..58c9197f7f799 100644
--- a/polly/test/ScopDetect/intrinsics_1.ll
+++ b/polly/test/ScopDetect/intrinsics_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop: for.cond => for.end
 ;
diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll
index b4cc3df7c746b..f71016e6d04cd 100644
--- a/polly/test/ScopDetect/intrinsics_2.ll
+++ b/polly/test/ScopDetect/intrinsics_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we allow the lifetime markers for the tmp array.
 ;
diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll
index 08fdee573ba0f..579d5bd481d44 100644
--- a/polly/test/ScopDetect/intrinsics_3.ll
+++ b/polly/test/ScopDetect/intrinsics_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we allow the misc intrinsics.
 ;
diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll
index c7d7c51e7d220..db4898c9c7bd7 100644
--- a/polly/test/ScopDetect/invalid-latch-conditions.ll
+++ b/polly/test/ScopDetect/invalid-latch-conditions.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 
 ; The latch conditions of the outer loop are not affine, thus the loop cannot
 ; handled by the domain generation and needs to be overapproximated.
diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
index 977918eb5168d..ddef510ad4d9f 100644
--- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll
+++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll
index 932c218170caf..10479643959cb 100644
--- a/polly/test/ScopDetect/invariant-load-before-scop.ll
+++ b/polly/test/ScopDetect/invariant-load-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ;
 ; The LoadInst %.b761 is defined outside the SCoP, hence is always constant
 ; within it. It is no "required invariant load".
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll
index efd81c695ca0d..074aae9ae95c9 100644
--- a/polly/test/ScopDetect/keep_going_expansion.ll
+++ b/polly/test/ScopDetect/keep_going_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll
index c7972cc47a68d..64535d85f2ab1 100644
--- a/polly/test/ScopDetect/mod_ref_read_pointer.ll
+++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop: for.body => for.end
 ; MODREF: Valid Region for Scop: for.body => for.end
diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll
index 1835342812b1f..30090652326d2 100644
--- a/polly/test/ScopDetect/more-than-one-loop.ll
+++ b/polly/test/ScopDetect/more-than-one-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll
index e89cea98ad21a..2a5f8b15534fa 100644
--- a/polly/test/ScopDetect/multidim-with-undef-size.ll
+++ b/polly/test/ScopDetect/multidim-with-undef-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: Valid Region for Scop: bb14 => bb17
diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll
index cbe7d0708b853..91202373263f0 100644
--- a/polly/test/ScopDetect/multidim.ll
+++ b/polly/test/ScopDetect/multidim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: Valid Region for Scop: bb19 => bb20
diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll
index 4af37ba064558..a9cd446d27670 100644
--- a/polly/test/ScopDetect/multidim_indirect_access.ll
+++ b/polly/test/ScopDetect/multidim_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we will recognize this SCoP.
 ;
diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
index 0286642f3c7a7..9c91fbfbe0b64 100644
--- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
+++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll
index 89071df596807..a0742112b6e12 100644
--- a/polly/test/ScopDetect/nested_loop_single_exit.ll
+++ b/polly/test/ScopDetect/nested_loop_single_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 
 ; void f(long A[], long N) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll
index b20828d9a7679..e74619cd87756 100644
--- a/polly/test/ScopDetect/non-affine-conditional.ll
+++ b/polly/test/ScopDetect/non-affine-conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll
index 77427397bac9d..9326cd4290380 100644
--- a/polly/test/ScopDetect/non-affine-float-compare.ll
+++ b/polly/test/ScopDetect/non-affine-float-compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
index f6ae9fe8dd544..1ab6b35ae93f1 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Here we have a non-affine loop but also a non-affine access which should
 ; be rejected as long as -polly-allow-nonaffine isn't given.
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
index 23c1765caecac..921f6ab535499 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
index 6e239a6570668..78774d92e0a46 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll
index dd675ccec5999..5136b3b8779b1 100644
--- a/polly/test/ScopDetect/non-affine-loop.ll
+++ b/polly/test/ScopDetect/non-affine-loop.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; This function/region does contain a loop, however it is non-affine, hence the access
 ; A[i] is also. Furthermore, it is the only loop, thus when we over approximate
diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
index 63b1cdb420b71..fd52c5df7b27e 100644
--- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
+++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid
 ;
diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
index ff4ad3218ffa5..d0c1f7a613332 100644
--- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
+++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: bb11 => bb25
 
diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll
index 5b9ed2b2ecae7..bdc48984f9961 100644
--- a/polly/test/ScopDetect/non-simple-memory-accesses.ll
+++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not model atomic memory accesses. We did not reason about
 ; how to handle them correctly and the Alias Set Tracker models some of them
diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll
index 3c487374c1973..63bd7b3a2f1f2 100644
--- a/polly/test/ScopDetect/non_affine_loop_condition.ll
+++ b/polly/test/ScopDetect/non_affine_loop_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll
index a8ce5bc636833..1d36f4df35bc3 100644
--- a/polly/test/ScopDetect/only-one-affine-loop.ll
+++ b/polly/test/ScopDetect/only-one-affine-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Even if we allow non-affine loops we can only model the outermost loop, all
 ; other loops are boxed in non-affine regions. However, the inner loops can be
diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll
index f4f35048fa8a0..4742375fec5cf 100644
--- a/polly/test/ScopDetect/only_func_flag.ll
+++ b/polly/test/ScopDetect/only_func_flag.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-only-func` limits analysis to `f` and `g`.
 ;
diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll
index f180fa765f4b0..2ad22c9f7a7f5 100644
--- a/polly/test/ScopDetect/only_func_flag_regex.ll
+++ b/polly/test/ScopDetect/only_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-polly-only-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-only-func` works with regexes.
 ;
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
index 71d1ba0accd32..271825a58c399 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Valid Region
 
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
index 6768c969a7428..2ab8997c63331 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; foo(float *A, long n, long k) {
 ; if (true)
diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
index 2e16b75ee3106..248bb43aacd98 100644
--- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
+++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Region with an exit node that has a PHI node multiple incoming edges from
 ; inside the region. Motivation for supporting such cases in Polly.
diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
index ac27016e3622d..d74185b45c752 100644
--- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@@ -1,8 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-detect-profitability-min-per-loop-insts=40 '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: -polly-detect-profitability-min-per-loop-insts=40 \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
 
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
 
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
 
 ; UNPROFITABLE-NOT: Valid Region for Scop:
 ; PROFITABLE: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll
index 80379bcc5d412..0291d3be452a1 100644
--- a/polly/test/ScopDetect/profitability-two-nested-loops.ll
+++ b/polly/test/ScopDetect/profitability-two-nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: next => bb3
 ;
diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll
index 1c77d730ed418..d95e9bde0b384 100644
--- a/polly/test/ScopDetect/remove_all_children.ll
+++ b/polly/test/ScopDetect/remove_all_children.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll
index 530a22f9ac3d4..5e4c38db5e53c 100644
--- a/polly/test/ScopDetect/report-scop-location.ll
+++ b/polly/test/ScopDetect/report-scop-location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
index 2ade0a97a5991..f49190b33ccf7 100644
--- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
+++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK-NOT: Valid Region for Scop:
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll
index 6f327e318082c..74cbedb34e5c6 100644
--- a/polly/test/ScopDetect/run_time_alias_check.ll
+++ b/polly/test/ScopDetect/run_time_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll
index 4f03845795c9c..f76c832ff08f5 100644
--- a/polly/test/ScopDetect/scev_remove_max.ll
+++ b/polly/test/ScopDetect/scev_remove_max.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
 
 ; This test case helps to determine whether SCEVRemoveMax::remove produces
 ; an infinite loop and a segmentation fault, if it processes, for example,
diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll
index 338a9ae6b6b0e..4a84f356f3e81 100644
--- a/polly/test/ScopDetect/sequential_loops.ll
+++ b/polly/test/ScopDetect/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll
index 5da4898517e22..33823b21fb8fb 100644
--- a/polly/test/ScopDetect/simple_loop.ll
+++ b/polly/test/ScopDetect/simple_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
index 00e11ab252e73..1bba2c21c7473 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
index 9f75b80f58cef..93ec84e911c5d 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
index c6ce482403400..33b0d8d7d6fc0 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
index c90c4915e866d..9b47b7c946caf 100644
--- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
+++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll
index 67f677892313c..4a0a3adab661d 100644
--- a/polly/test/ScopDetect/simple_loop_with_param.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
 
 ; void f(long A[], long N, long *init_ptr) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll
index 9e7b55efc48d9..670936b6fee80 100644
--- a/polly/test/ScopDetect/simple_loop_with_param_2.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], int N, int *init_ptr) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll
index e56c022aa5466..6ace3b636019b 100644
--- a/polly/test/ScopDetect/simple_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll
index 789942a950051..2150a3e8c35dd 100644
--- a/polly/test/ScopDetect/skip_function_attribute.ll
+++ b/polly/test/ScopDetect/skip_function_attribute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify polly skips this function
 ;
diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
index 471602968055e..66c3b045f62a4 100644
--- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll
+++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll
index 5d87599da29f7..a1dcebec63ff8 100644
--- a/polly/test/ScopDetect/statistics.ll
+++ b/polly/test/ScopDetect/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll
index 1e825f4950afa..2f9b670384db2 100644
--- a/polly/test/ScopDetect/switch-in-loop-patch.ll
+++ b/polly/test/ScopDetect/switch-in-loop-patch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Valid
 
diff --git a/polly/test/ScopDetect/tlr_is_hoistable_load.ll b/polly/test/ScopDetect/tlr_is_hoistable_load.ll
index 24a3f55a519e2..5c33522f62325 100644
--- a/polly/test/ScopDetect/tlr_is_hoistable_load.ll
+++ b/polly/test/ScopDetect/tlr_is_hoistable_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting -polly-detect-full-functions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-detect-full-functions -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; This testcase checks for compatibility of the -detect-full-functions
 ; flag in combination with the -invariant-load-hoisting option. More
diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
index e7245d80b60ed..4ae86a940e0c8 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ;void f(int A[], int B[]) {
 ; for (int i=0; i<42; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
index 2a0b281073f59..adb14b5b017d4 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).
 
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
index fc4c1fbcef484..428a7cf855f6e 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; #define N 1024
 ; double invalidCall(double A[N]);
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
index 7a540d606eadf..30e5fb9fdeba8 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;void foo(int a, int b) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
index 512366f1bc7ce..2bc515e0ae5e1 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow.
 
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index e844aea24ac26..a96b64e4e0d54 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -1,6 +1,16 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-process-unprofitable=false -polly-detect-track-failures -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
+; RUN: opt %loadNPMPolly \
+; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
+; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \
+; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly \
+; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
+; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \
+; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-process-unprofitable=false \
+; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \
+; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
 
 ; void f(int A[], int n) {
 ;   for (int i = 0; i < A[n+i]; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
index d80911cc0ec9a..6156efaea1909 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
@@ -4,8 +4,8 @@
 ; the PostDominatorTree. Infinite loops are postdominated only by the virtual
 ; root, which causes them not to appear in regions in ScopDetection anymore.
 
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func (int param0, int N, int *A)
 ; {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
index d8c2916cc23bb..dd95bd6ede715 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=ALL
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN-ALL
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
 
 ; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n])
 ; 2 {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
index ee0aa743f434b..13ac9d5ace2d3 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ; void f(int A[]) {
 ; for(int i=0; i<42; ++i)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
index ad2c813c4b7ce..93e9e8b14038b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s
 
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false -pass-remarks-output=%t.yaml < %s 2>&1
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
index d97032c8f8eaf..d110cfefc27dd 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \
+; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ; long i;
diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
index 7a5025c0c2fbe..5f296fae9532b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ; struct b {
 ; double **b;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
index e15c045907ddf..3cdeed13ec285 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
 ;
 ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c
 ;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
index b5918d9f7a2d4..4a9a200d67dfd 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
 
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
index 502abf8dab6d7..61ff033d9f934 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
index accb562771819..c5efec3f50c58 100644
--- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
+++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
@@ -1,5 +1,5 @@
 ; This should be run without alias analysis enabled.
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
 
 define i32 @main() nounwind {
diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
index 57ae977a1a13f..81c7efb963652 100644
--- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
+++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 declare void @foo()
diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
index 3cb63cc4f952c..5abf8ff29ef85 100644
--- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
+++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
 
diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
index 668fcd8fabcaf..d16ba453f9815 100644
--- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
+++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll
index 50c1b65727eaf..ebbe744627ef8 100644
--- a/polly/test/ScopInfo/Alias-0.ll
+++ b/polly/test/ScopInfo/Alias-0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll
index 15fd6c936fc47..b1711c25857d0 100644
--- a/polly/test/ScopInfo/Alias-1.ll
+++ b/polly/test/ScopInfo/Alias-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll
index 598ad0fe8cf1c..b94f130c94ebd 100644
--- a/polly/test/ScopInfo/Alias-2.ll
+++ b/polly/test/ScopInfo/Alias-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll
index 388a2defec395..af7816546b4ab 100644
--- a/polly/test/ScopInfo/Alias-3.ll
+++ b/polly/test/ScopInfo/Alias-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll
index e9f4f95a9997f..fe651c87b241c 100644
--- a/polly/test/ScopInfo/Alias-4.ll
+++ b/polly/test/ScopInfo/Alias-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll
index d44c18cf49e36..0b69beaaf3f9c 100644
--- a/polly/test/ScopInfo/BoundChecks/single-loop.ll
+++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; This only works after the post-dominator tree has been fixed.
 ;
diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll
index 9034f75f13792..f2ba17d33c0ea 100644
--- a/polly/test/ScopInfo/BoundChecks/two-loops.ll
+++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; This only works after the post-dominator tree has fixed.
 ; XFAIL: *
diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll
index e8edad9494075..3b0c673ece38b 100644
--- a/polly/test/ScopInfo/NonAffine/div_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A) {
 ; for (long i = 1;; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll
index c195bb42dac9f..34a5cecdfe3df 100644
--- a/polly/test/ScopInfo/NonAffine/div_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/div_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A) {
 ; for (long i = 0; i < 16; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
index 31ecdaa0ef3e4..7d02fae7f98f3 100644
--- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
+++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int *B, int *C) {
 ; for (int i = 0; i < 1000; i++)
diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
index e0cd1e51a095c..d5c808d9021f2 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 };
diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
index 53bbe15799e61..13fe53f11633d 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; TODO: The new domain generation cannot handle modulo domain constraints,
 ; hence modulo handling has been disabled completely. Once this is
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
index 7d34ef9644b5a..2b8427d74ec84 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
 ;
 ; SCALAR: Function: f
 ; SCALAR-NEXT: Region: %bb1---%bb13
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
index a40afdde1237f..30f756e81e474 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
index f3678d3245f57..6dacd719862ef 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
index 85a1081159d59..8a13f791ed6de 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ; for (int i = 0; i < 128; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
index 65513a5d9d1fb..1e70d2c9db87e 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_for_body
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
index 0185774d6274c..dcfaa9280dcb8 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void pos(float *A, long n) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
index ab47dc0b78260..24bfe60502163 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
index 51a7d54562780..931ad36d15f34 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
@@ -1,5 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \
+; RUN: --check-prefix=ALL
 ;
 ; Negative test for INNERMOST.
 ; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
index b1f7e65e9dd25..37b51cebd74d5 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
@@ -1,6 +1,16 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-process-unprofitable=false -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-process-unprofitable=false \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Negative test for INNERMOST.
 ; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
index ac77dfb7454d3..7bfd7f86efcdb 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
index db08544aa559c..fc779d544e62f 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
 ;
 ; void f(int *A, int *C) {
 ; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
index cde2dc495d549..63ff354d7e5f7 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Verify that we over approximate the read access of A[j] in the last statement as j is
 ; computed in a non-affine loop we do not model.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
index ce4cc6189d45c..d33befe2c66e0 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, double A[], int INDEX[]) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
index b46ce87a45e2d..77c2df48d6514 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
index 58e5ccd9b6e36..9ed340d1d304b 100644
--- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
+++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; Regression test that triggered a memory leak at some point (24947).
 ;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
index d94fc5f8a8823..cbd024ba7a392 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that there is no alias group because we either access A or B never both.
 ;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
index df7f75dd8d95e..3858d8a7bb1d6 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we create two alias groups since the minimal/maximal accesses
 ; depend on %b.
diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll
index 0ebc39c0e5a78..7baa3dce1f9db 100644
--- a/polly/test/ScopInfo/aliasing_dead_access.ll
+++ b/polly/test/ScopInfo/aliasing_dead_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not create a SCoP if there is no statement executed.
 ;
diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
index 8e5bab661e18c..7265aab22a490 100644
--- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
+++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FOUND
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s --check-prefix=IGNORED
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \
+; RUN: --check-prefix=IGNORED
 ;
 ; FOUND: Function: foo
 ; IGNORED-NOT: Function: foo
diff --git a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
index aec6ea0bf1441..c7592bcb09fcf 100644
--- a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
+++ b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8
-; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7
+; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=print<polly-function-scops>' -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8
+; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=print<polly-function-scops>' -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7
 ;
 ; Check that we allow this SCoP even though it has 10 parameters involved in possibly aliasing accesses.
 ; However, only 7 are involved in accesses through B, 8 through C and none in accesses through A.
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
index a7dbe0baeae5d..d66a10bc511b1 100644
--- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
+++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
index db54a1687b4d5..9943802ec8595 100644
--- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
 ;
 ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
 ; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
index 0001b8adb41e1..900d5d40d96f5 100644
--- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
+++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; @test1
 ; Make sure we generate the correct aliasing check for a fixed-size memset operation.
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
index 93253b7e65d4a..70c3c56fb3112 100644
--- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
+++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
@@ -1,9 +1,14 @@
-; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP
 
-; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
 
-; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE
 
 ; SCOP: Function: hoge
 ; SCOP-NEXT: Region: %bb15---%bb37
 
diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll
index 994d49e5b887f..bd14e3868d525 100644
--- a/polly/test/ScopInfo/assume_gep_bounds.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(float A[][20][30], long n, long m, long p) {
 ; for (long i = 0; i < n; i++)
diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll
index be43be598bd3d..7a8c1870abe25 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_2.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-inbounds < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
+; RUN: -polly-precise-inbounds | FileCheck %s
 ;
 ; void foo(float A[restrict][20], float B[restrict][20], long n, long m,
 ; long p) {
diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll
index cfd9008741c3a..01fc12cd7f108 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_many.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \
+; RUN: < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d }
diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
index b3aa7686d3010..3fb7a1329c745 100644
--- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
+++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do no introduce a parameter here that is actually not needed.
 ;
diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll
index 01c6d52c30f76..81fcade08f65a 100644
--- a/polly/test/ScopInfo/bool-addrec.ll
+++ b/polly/test/ScopInfo/bool-addrec.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) {
 ; CHECK-NEXT: if (c0 % 2 == 0)
diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll
index 21ba391f4fc1a..5628092de7765 100644
--- a/polly/test/ScopInfo/bounded_loop_assumptions.ll
+++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The assumed context is tricky here as the equality test for the inner loop
 ; allows an "unbounded" loop trip count. We assume that does not happen, thus
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
index d25a8e666b525..83743e4e4ecc7 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
@@ -1,6 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=DETECT
 
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=SCOP
 
 ; DETECT: Valid Region for Scop: loop => barrier
 ; DETECT-NEXT: Valid Region for Scop: branch => end
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
index 91aa96e0f3501..9685ba37a49a1 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
 
 ; NONAFFINE: Statements {
 ; NONAFFINE-NEXT: Stmt_loop
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
index 22a60c764eb4d..f41e6500fb30a 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
 
 
 ; NONAFFINE-NOT: Statements
diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll
index 1d248891dfd09..71e7051922b53 100644
--- a/polly/test/ScopInfo/bug_2010_10_22.ll
+++ b/polly/test/ScopInfo/bug_2010_10_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll
index 7c76c3eaa565a..f4a24e06f46ae 100644
--- a/polly/test/ScopInfo/bug_2011_1_5.ll
+++ b/polly/test/ScopInfo/bug_2011_1_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 
 ; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass.
 ; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks.
diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
index 6e1ef2339a81d..ed6bbafdac1f0 100644
--- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
+++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 @edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1]
diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll
index 2b702e235ca6c..9161d3db4167a 100644
--- a/polly/test/ScopInfo/cfg_consequences.ll
+++ b/polly/test/ScopInfo/cfg_consequences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void consequences(int *A, int bool_cond, int lhs, int rhs) {
 ;
diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll
index f48089afb93b9..de79c2226e68d 100644
--- a/polly/test/ScopInfo/complex-branch-structure.ll
+++ b/polly/test/ScopInfo/complex-branch-structure.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
 
 ; We build a scop of the following form to check that the domain construction
 ; does not take a huge amount of time, but that we instead just bail out.
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll index 9164959c1f6dc..c3b8d2bb0ef88 100644 --- a/polly/test/ScopInfo/complex-condition.ll +++ b/polly/test/ScopInfo/complex-condition.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll index 456edb04e0c2b..4a2a1d2a64a6d 100644 --- a/polly/test/ScopInfo/complex-expression.ll +++ b/polly/test/ScopInfo/complex-expression.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; This test case has an SCEVSMax expression with a very high arity. The ; piecewise affine function we would create for it would have a huge amount of diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll index 4ffd8689f1a4a..36cb078f19fff 100644 --- a/polly/test/ScopInfo/complex-loop-nesting.ll +++ b/polly/test/ScopInfo/complex-loop-nesting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll index 32425d7598bc9..f4a78bf753853 100644 --- a/polly/test/ScopInfo/complex-successor-structure-2.ll +++ b/polly/test/ScopInfo/complex-successor-structure-2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; We build a scop for the region for.body->B13. The CFG is of the following ; form and the branch conditions are build from "smax" SCEVs. 
However, in diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll index c01eca534bcf1..6da1fe3a8b9f3 100644 --- a/polly/test/ScopInfo/complex-successor-structure-3.ll +++ b/polly/test/ScopInfo/complex-successor-structure-3.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; Check that propagation of domains from A(X) to A(X+1) will keep the ; domains small and concise. diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll index 1b39f4cf192eb..6c87ba3e98505 100644 --- a/polly/test/ScopInfo/complex-successor-structure.ll +++ b/polly/test/ScopInfo/complex-successor-structure.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; We build a scop from the region for.body->B13. The CFG is of the ; following form. The test checks that the condition construction does not take diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll index 42a114eaa6ec1..6e28c9dfee06a 100644 --- a/polly/test/ScopInfo/complex_domain_binary_condition.ll +++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll index 9896fba8904b8..9880a1dd67d19 100644 --- a/polly/test/ScopInfo/complex_execution_context.ll +++ b/polly/test/ScopInfo/complex_execution_context.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: ; diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll index ecc2767fd6ecd..552fddc6ff08c 100644 --- a/polly/test/ScopInfo/cond_constant_in_loop.ll +++ b/polly/test/ScopInfo/cond_constant_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll index 
0f31904133719..c06dcd955bac1 100644 --- a/polly/test/ScopInfo/cond_in_loop.ll +++ b/polly/test/ScopInfo/cond_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll index 257b2ede236d9..8c4b2170ad69b 100644 --- a/polly/test/ScopInfo/condition-after-error-block-2.ll +++ b/polly/test/ScopInfo/condition-after-error-block-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that we do not allow PHI nodes such as %phi, if they reference an error ; block and are used by anything else than a terminator instruction. diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll index d86b48ed24963..d5069da916fa1 100644 --- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll +++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll index 8ad98b4a4a78e..d9de4fc40a208 100644 --- a/polly/test/ScopInfo/condtion-after-error-block.ll +++ b/polly/test/ScopInfo/condtion-after-error-block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that we allow scops containing uniform branch conditions, where all ; but one incoming block comes from an error condition. 
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll index b50c4bd910dda..b4c2f119fe053 100644 --- a/polly/test/ScopInfo/const_srem_sdiv.ll +++ b/polly/test/ScopInfo/const_srem_sdiv.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf ; diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll index f09f82f32c93a..86dd94e3371b2 100644 --- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll +++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; At some point this caused a problem in the domain generation as we ; assumed any constant branch condition to be valid. However, only constant diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll index 26c73bd72271b..b58d413e074e7 100644 --- a/polly/test/ScopInfo/constant_factor_in_parameter.ll +++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s ; ; Check that the constant part of the N * M * 4 expression is not part of the ; parameter but explicit in the access function. 
This can avoid existentially diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll index 762132f9edd78..62e6cd4641de1 100644 --- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll +++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll index 6d17288b28227..8991f8250f0b7 100644 --- a/polly/test/ScopInfo/constant_start_integer.ll +++ b/polly/test/ScopInfo/constant_start_integer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(float *input) { diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll index 63c1baca5accc..a6761ecebe6a7 100644 --- a/polly/test/ScopInfo/debug_call.ll +++ b/polly/test/ScopInfo/debug_call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the call to dbg_printf is accepted as a debug-function. 
; diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll index 7126fb95cd00c..676c8a27e5749 100644 --- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll +++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void foo(long n, long m, long o, double A[n][m][o]) { ; for (long i = 0; i < n-3; i++) diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll index 62a13de7ceac0..aecd16833b84e 100644 --- a/polly/test/ScopInfo/div_by_zero.ll +++ b/polly/test/ScopInfo/div_by_zero.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll index 333175b417ade..a3ca59563ab1f 100644 --- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll +++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; Check that we do not crash on this input. Earlier this indeed crashed as ; we tried to model the access functions in an error block. diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll index b111851939d06..a988b3f8c2b01 100644 --- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll +++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll index 3ee6ff7889c84..9a1edcbfb7796 100644 --- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll +++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check we do not crash. 
; diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll index 902ea15752980..047b095a95947 100644 --- a/polly/test/ScopInfo/error-blocks-1.ll +++ b/polly/test/ScopInfo/error-blocks-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 } diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll index 613b00a1a9ba7..6fa12947540c0 100644 --- a/polly/test/ScopInfo/error-blocks-2.ll +++ b/polly/test/ScopInfo/error-blocks-2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/error-blocks-3.ll b/polly/test/ScopInfo/error-blocks-3.ll index 9521037888075..e7643601356db 100644 --- a/polly/test/ScopInfo/error-blocks-3.ll +++ b/polly/test/ScopInfo/error-blocks-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; The instruction ; diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll index d47b2865b4ee0..2efaef3fb99b8 100644 --- a/polly/test/ScopInfo/escaping_empty_scop.ll +++ b/polly/test/ScopInfo/escaping_empty_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void g(); ; int f(int *A) { diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll index 21f13cf4f4e4d..cbd6c280e8caa 100644 --- a/polly/test/ScopInfo/exit-phi-1.ll +++ b/polly/test/ScopInfo/exit-phi-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; Check for correct code generation of exit PHIs, even if the same PHI value ; is used again inside the the SCoP. 
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll index b8da9ab5b64f9..695c617b14c1f 100644 --- a/polly/test/ScopInfo/exit-phi-2.ll +++ b/polly/test/ScopInfo/exit-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no MK_ExitPHI READ access. ; diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll index 928b564c7cef5..b3b7cb1c65993 100644 --- a/polly/test/ScopInfo/exit_phi_accesses-2.ll +++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK-LABEL: Function: foo ; diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll index a54ca4a185ae2..77b038ec8e4af 100644 --- a/polly/test/ScopInfo/exit_phi_accesses.ll +++ b/polly/test/ScopInfo/exit_phi_accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary ; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll index c0d2dcd16289d..95212f83acdca 100644 --- a/polly/test/ScopInfo/expensive-boundary-context.ll +++ b/polly/test/ScopInfo/expensive-boundary-context.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ +; RUN: < %s 2>&1 | FileCheck %s ; CHECK-NOT: Assumed Context: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll index 2f446b630168a..5e833e7ae0f4f 100644 --- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll +++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; CHECK: Valid Region for Scop: bb10 => bb16 diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll index 20cb137181697..596c3d0af66a9 100644 --- a/polly/test/ScopInfo/full-function.ll +++ b/polly/test/ScopInfo/full-function.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output 
-polly-detect-full-functions < %s 2>&1 | FileCheck %s -check-prefix=FULL -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=WITHOUT-FULL +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=FULL +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL ; FULL: Region: %bb---FunctionExit ; FULL: Statements { diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll index 638b09879ce39..17f75fbf8a979 100644 --- a/polly/test/ScopInfo/granularity_same_name.ll +++ b/polly/test/ScopInfo/granularity_same_name.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB ; ; Check that the statement has the same name, regardless of how the ; basic block is split into multiple statements. diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll index f4d864d2c6543..5c4484f9d4579 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. 
; This case has the instructions of the two statements interleaved, such that diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll index f2c37f6293d62..7ae0d961b38fb 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll index f7bd882da96e2..7839e51c163ae 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll index 80aa9fb6deb7c..8643e85e05593 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. ; This case has an independent statement just for PHI writes. 
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll index 66ef9fa9429e9..bc71cbe45cd98 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the PHI Write of value that is defined in the same basic ; block is in the statement where it is defined. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll index 3837219e5d818..f3864bac519b9 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case has no explicit epilogue for PHI writes because it would ; have a scalar dependency to the previous statement. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll index c43ad76d079d8..43101a8a0abfc 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case should be split into two statements because {X[0], Y[0]} ; and {A[0], B[0]} do not intersect. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll index cfa7739d743f7..4974f7e9b28ca 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case cannot be split into two statements because the order of ; loads and store would be violated. 
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll index cf5b533c02682..be3e287372017 100644 --- a/polly/test/ScopInfo/i1_params.ll +++ b/polly/test/ScopInfo/i1_params.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that both a signed as well as an unsigned extended i1 parameter ; is represented correctly. diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll index 9221ddf5fc910..7a0bfe0fa4d84 100644 --- a/polly/test/ScopInfo/infeasible-rtc.ll +++ b/polly/test/ScopInfo/infeasible-rtc.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=SCOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll index 7ab6477460721..006901ab05b79 100644 --- a/polly/test/ScopInfo/infeasible_invalid_context.ll +++ b/polly/test/ScopInfo/infeasible_invalid_context.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=SCOPS ; DETECT: Valid Region for Scop: if.end116 => for.inc216 ; SCOPS-NOT: Statements diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll index adefe794561c2..578015aeecdc5 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll index a88fcdc0f9b12..627524c0327dd 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | 
FileCheck %s -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *B, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll index 5f89243be0e3b..4f6d1117e2bcc 100644 --- a/polly/test/ScopInfo/integers.ll +++ b/polly/test/ScopInfo/integers.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that we correctly convert integers to isl values. diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll index 0829f34be9791..761fcbbe3435e 100644 --- a/polly/test/ScopInfo/inter-error-bb-dependence.ll +++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s ; ; Error statements (%bb33) do not require their uses to be verified. ; In this case it uses %tmp32 from %bb31 which is not available because diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll index f6406640dd2d8..7313618b082bc 100644 --- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll index 3150204cd9549..d2ed3c17fe9dd 100644 --- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll +++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_loop__TO__backedge diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll index b0b63658caa55..b3286cd2a7240 100644 --- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa 
'-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll index 0ef6b2d35106b..86855e7499a51 100644 --- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll index e17d06f753a21..e6d9e733e35bf 100644 --- a/polly/test/ScopInfo/intrinsics.ll +++ b/polly/test/ScopInfo/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we remove the ignored intrinsics from the instruction list. ; diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll index d3439d8d33662..723942668d8c2 100644 --- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll +++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; This crashed at some point as we place %1 and %4 in the same equivalence class ; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll index ff5b0f601d03f..c493c22af32d9 100644 --- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll +++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check that no invalidated iterator is accessed while elements from ; the list of MemoryAccesses are removed. diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll index 1ec36e6d9d1b9..ecb80e4054c35 100644 --- a/polly/test/ScopInfo/invariant-load-instlist.ll +++ b/polly/test/ScopInfo/invariant-load-instlist.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; The load is a required invariant load and at the same time used in a store. ; Polly used to add two MemoryAccesses for it which caused an assertion to fail. 
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll index 2d14287d4df44..89eac6ce69a11 100644 --- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll +++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_L_4 diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll index 8974b7f7fb8cb..9dc064276c40f 100644 --- a/polly/test/ScopInfo/invariant_load.ll +++ b/polly/test/ScopInfo/invariant_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll index 7b5a7591813a6..40aa3098683b3 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll index 0c2f57dfcb1c3..287676024079c 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git 
a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll index 865bd789db6fb..cb745b4920b82 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; void f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll index f63fe9cc1f7c6..fa5429d4803a8 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; int f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll index e70aa80ae6009..2e639f7d5e331 100644 --- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll +++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Region: %entry.split---%if.end ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll index 1176d1ca9db85..f2539af97a0b7 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: 
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll index 81fd3b9559f43..f854b1f48ea92 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll index 7313176aceed7..5a9c5c6cabbe6 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll index f6cadffe311e8..d12750c30ba98 100644 --- a/polly/test/ScopInfo/invariant_load_branch_condition.ll +++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll index 76cc55767caca..34d50a18663c4 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; CHECK: Stmt_body1 ; CHECK-NEXT: Domain := diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll index 9cc9391b6bc25..51f3cf6c095ac 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | 
FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Make sure we choose a canonical element that is not the first invariant load, ; but the first that is an array base pointer. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll index 7f609f9a54689..3a742bbccdf19 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that we canonicalize accesses even tough one of the accesses (even ; the canonical base) has a partial execution context. This is correct as diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll index 216e0760987cd..6bd8b3146e871 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that a delinearized and a not delinearized access are not ; canonicalized. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll index 5da3d0ceb2d0f..cb7e5646fc2b0 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that two arrays delinearized with different sizes are not coalesced. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll index b71a092a2d468..6f7fbacc089cb 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that arrays with different element types are not coalesced. 
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll index 2c4683ea5ce96..445832822bdf0 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that nested arrays with invariant base pointers are handled correctly. ; Specifically, we currently do not canonicalize arrays where some accesses are diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll index e6ea032004a96..11e7088d68dbd 100644 --- a/polly/test/ScopInfo/invariant_load_complex_condition.ll +++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll index 8b1dc8be87c86..c7d7b3c9ba611 100644 --- a/polly/test/ScopInfo/invariant_load_condition.ll +++ b/polly/test/ScopInfo/invariant_load_condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll index fc5527c48c411..526bdc6ddb3bd 100644 --- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll +++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect;scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Function: foo_undereferanceable diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll index b5525a8e2639e..eb148063320e7 100644 --- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll +++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
'-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not consolidate the invariant loads to smp[order - 1] and ; smp[order - 2] in the blocks %0 and %16. While they have the same pointer diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll index 69a7932fd3f58..5261113f5a0cf 100644 --- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll +++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop ; diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll index 9258d75f6e294..ee889e6c4d5a1 100644 --- a/polly/test/ScopInfo/invariant_load_loop_ub.ll +++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll index 50b0103b73efb..6af7caecc0b37 100644 --- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll +++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; Note: The order of the invariant accesses is important because A is the ; base pointer of tmp3 and we will generate code in the same order as diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll index ae1423e1e5f05..319f24bdcb920 100644 --- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll +++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git 
a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll index 8062d875b1174..715948062c055 100644 --- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll +++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; This test case verifies that the statement domain of the invariant access ; is the universe. In earlier versions of Polly, we accidentally computed an diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll index 9ee4a54168a68..a6108320d5608 100644 --- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll +++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; ; Stress test for the code generation of invariant accesses. ; diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll index 5bd2c51d86fa6..e3c183aab5e26 100644 --- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll +++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; void f(int *I0, int *I1, int *V) { ; for (int i = 0; i < 1000; i++) { diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll index 426c14c191dd1..b5168e912ed74 100644 --- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll +++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
-passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; ; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : } ; diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll index 77f74df7d7b21..85360821078dc 100644 --- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll +++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll index f18534d5bee24..134eac22bff5c 100644 --- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll +++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Negative test. If we assume UB[*V] to be invariant we get a cyclic ; dependence in the invariant loads that needs to be resolved by diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll index dcf7f50eb27c4..f22199cfe4942 100644 --- a/polly/test/ScopInfo/invariant_loop_bounds.ll +++ b/polly/test/ScopInfo/invariant_loop_bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll index df5798638ba7c..e3292b4e4aefa 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all ; three loads that occur in the region but actually access the same diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll index 3d8c232c75970..d69438de5817f 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true 
-disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all ; three loads that occur in the region but actually access the same diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll index 965531f20b01d..2df96faf76249 100644 --- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll +++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s 2>&1 +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1 ; Used to fail with: ; ../../isl/isl_aff.c:591: position out of bounds diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll index 79621ce64bbcc..480b6e9574a66 100644 --- a/polly/test/ScopInfo/isl_trip_count_01.ll +++ b/polly/test/ScopInfo/isl_trip_count_01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] }; ; diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll index 3052299277844..b78fb838edd0f 100644 --- a/polly/test/ScopInfo/isl_trip_count_02.ll +++ b/polly/test/ScopInfo/isl_trip_count_02.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: We do not allow unbounded loops at the moment. ; diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll index 52fde263d6898..96df05f89bcff 100644 --- a/polly/test/ScopInfo/isl_trip_count_03.ll +++ b/polly/test/ScopInfo/isl_trip_count_03.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Test comes from a bug (15771) or better a feature request. It was not allowed ; in Polly in the old domain generation as ScalarEvolution cannot figure out the diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll index 657b8f6dc64e1..fd310ececaa38 100644 --- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). 
The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/licm_load.ll b/polly/test/ScopInfo/licm_load.ll index 8f1cf4fa8fd91..ade640976d007 100644 --- a/polly/test/ScopInfo/licm_load.ll +++ b/polly/test/ScopInfo/licm_load.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes='loop(loop-rotate,indvars),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly -passes='loop-mssa(loop-rotate,indvars,licm),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s ; ; void foo(int n, float A[static const restrict n], ; float B[static const restrict n], int j) { @@ -11,30 +14,26 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %j) { entry: %tmp = sext i32 %n to i64 - %cmp1 = icmp slt i64 0, %tmp - br i1 %cmp1, label %for.body.lr.ph, label %for.end + br label %for.cond -for.body.lr.ph: ; preds = %entry +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond %idxprom = sext i32 %j to i64 %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom %tmp2 = load i32, ptr %arrayidx, align 4 - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.inc - %indvars.iv2 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ] - %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv2 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv store i32 %tmp2, ptr %arrayidx2, align 4 br label %for.inc for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1 - %exitcond = icmp ne i64 %indvars.iv.next, %tmp - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.inc - br label %for.end + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond -for.end: ; preds = %for.cond.for.end_crit_edge, %entry +for.end: ; preds = %for.cond ret void } diff --git a/polly/test/ScopInfo/licm_potential_store.ll b/polly/test/ScopInfo/licm_potential_store.ll index cbd8e410ed7c8..8a36ee84313a2 100644 --- a/polly/test/ScopInfo/licm_potential_store.ll +++ b/polly/test/ScopInfo/licm_potential_store.ll @@ -1,4 +1,10 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NOLICM +; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,polly-prepare,print<polly-function-scops>' \ +; RUN: -tailcallopt -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOLICM + +; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,loop-mssa(licm),polly-prepare,print<polly-function-scops>' \ +; RUN: -tailcallopt -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=LICM ; void foo(int n, float A[static const restrict n], float x) { ; // (0) @@ -11,40 +17,67 @@ ; // (4) ; } +; LICM: Statements ; NOLICM: Statements target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(i32 %n, ptr noalias nonnull %A, float %x) { entry: - %smax = call i32 @llvm.smax.i32(i32 %n, i32 0) - %0 = add nuw i32 %smax, 1 - br label %for.cond.1.preheader + %n.addr = alloca i32, align 4 + %A.addr = alloca ptr, align 8 + %x.addr = alloca float, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 %n, ptr %n.addr, align 4 + store ptr %A, ptr %A.addr, align 8 + store float %x, ptr %x.addr, align 4 + %tmp = load i32, ptr %n.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc.4, %entry + %tmp2 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %tmp2, 5 + br i1 %cmp, label %for.body, label %for.end.6 -for.cond.1.preheader: ; preds = %entry, %for.end - %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ] - %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ] +for.body: ; preds = %for.cond + store i32 0, ptr %j, align 4 br label %for.cond.1 -for.cond.1: ; preds = %for.cond.1, %for.cond.1.preheader - %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ] - %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ] - %add = add nuw i32 %j.0, 1 - %exitcond = icmp ne i32 %add, %0 - br i1 %exitcond, label %for.cond.1, label %for.end +for.cond.1: ; preds = %for.inc, %for.body + %tmp3 = load i32, ptr %j, align 4 + %tmp4 = load i32, ptr %n.addr, align 4 + %cmp2 = icmp slt i32 %tmp3, %tmp4 + br i1 %cmp2, label %for.body.3, label %for.end + +for.body.3: ; preds = %for.cond.1 + store float 7.000000e+00, ptr %x.addr, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %tmp5 = load i32, ptr %j, align 4 + %add = add nsw i32 %tmp5, 1 + store i32 %add, ptr %j, align 4 + br label %for.cond.1 for.end: ; preds = %for.cond.1 - %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ] - store float %x.addr.1.lcssa, ptr %A, align 4 - %add5 = add nuw nsw i32 %i.05, 1 - %exitcond6 = icmp ne i32 %add5, 5 - br i1 %exitcond6, label %for.cond.1.preheader, label %for.end.6 + %tmp6 = load float, ptr %x.addr, align 4 + %tmp7 = load ptr, ptr %A.addr, align 8 + store float %tmp6, ptr %tmp7, align 4 + br label %for.inc.4 + +for.inc.4: ; preds = %for.end + %tmp8 = load i32, ptr %i, align 4 + %add5 = add nsw i32 %tmp8, 1 + store i32 %add5, ptr %i, align 4 + br label %for.cond -for.end.6: ; preds = %for.end +for.end.6: ; preds = %for.cond ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: Statements { +; CHECK: Stmt_for_end +; CHECK: } diff --git a/polly/test/ScopInfo/licm_potential_store_mssa.ll b/polly/test/ScopInfo/licm_potential_store_mssa.ll deleted file mode 100644 index ce785d622fcb3..0000000000000 --- a/polly/test/ScopInfo/licm_potential_store_mssa.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LICM - -; void foo(int n, float A[static const restrict n], float x) { -; // (0) -; for (int i = 0; i < 5; i += 1) { -; for (int j = 0; j < n; j += 1) { -; x = 7; // (1) -; } -; A[0] = x; // (3) -; } -; // (4) -; } - -; LICM: Statements - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i32 %n, ptr noalias nonnull %A, float %x) { -entry: - %smax = call 
i32 @llvm.smax.i32(i32 %n, i32 0) - br label %for.cond.1.preheader - -for.cond.1.preheader: ; preds = %for.end, %entry - %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ] - %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ] - br label %for.cond.1 - -for.cond.1: ; preds = %for.cond.1, %for.cond.1.preheader - %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ] - %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ] - %add = add nuw i32 %j.0, 1 - %exitcond.not = icmp eq i32 %j.0, %smax - br i1 %exitcond.not, label %for.end, label %for.cond.1 - -for.end: ; preds = %for.cond.1 - %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ] - %add5 = add nuw nsw i32 %i.05, 1 - %exitcond6.not = icmp eq i32 %add5, 5 - br i1 %exitcond6.not, label %for.end.6, label %for.cond.1.preheader - -for.end.6: ; preds = %for.end - %x.addr.1.lcssa.lcssa = phi float [ %x.addr.1.lcssa, %for.end ] - store float %x.addr.1.lcssa.lcssa, ptr %A, align 4 - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll index 50625b2ddabde..c1676033fa909 100644 --- a/polly/test/ScopInfo/licm_reduction_nested.ll +++ b/polly/test/ScopInfo/licm_reduction_nested.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; XFAIL: * ; diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll index 8225bd04fce63..f102518da5261 100644 --- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll +++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; Verify that the compilation of this test case does not take infinite time. 
 ; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index 064a0d3e700b9..e32748a4bbb57 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index edaadd61dc020..b32b87b5c3f3a 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 391f0ec8c0f59..431c907857fec 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index fcd56613fc095..918d4099740ce 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 392509871a9b7..8f7a87f1c5ac4 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ;void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index 665dc1ad244d9..2d9f997a0767f 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index 579f43d874577..20ebbfbc8b49c 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index ddad36065a5c8..5b003325ef0fb 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float a[100][100]) {
 ; float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index f33933210247d..71e9c02aa8dcc 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void bar();
 ; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 149a2fcfea772..6c45b0d41b76b 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; Ensure that ScopInfo's alias analysis llvm.memcpy for,
 ; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 6b7a9e2edffbe..95c455f097b21 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index aba886b59d1d5..8ff471a11cd17 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index 7eaec7bd1ad6a..89b0487728210 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 7bd3e90b3aa82..9755cf1129e68 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
 ;
 ; Verify we can handle a memset to "null" and that we do not model it.
 ; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index cd12421344f7f..f825cbff1ec56 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: AssumedContext
 
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 1e289425e86d7..6bc5f8d8eb73f 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-allow-modref-calls -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
+; RUN: -disable-output < %s
 ;
 ; Verify that we model the may-write access of the prefetch intrinsic
 ; correctly, thus that A is accessed by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 0b6e64da437fd..21322bc648f8e 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
+; RUN: -polly-allow-modref-calls < %s
 ;
 ; Verify that we model the read access of the gcread intrinsic
 ; correctly, thus that A is read by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 25d59d9f7fd16..25e56a08a961b 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
 ;
 ; Check that we assume the call to func has a read on the whole A array.
 ;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index f8cbb084aefe8..5cc96cf3a06eb 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
+; RUN: -polly-allow-modref-calls < %s
 ;
 ; Check that the call to func will "read" not only the A array but also the
 ; B array. The reason is the readonly annotation of func.
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index a9b53d53aea7e..0a8957da4931a 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index f86ddcea9fe2b..7af2411e7e8c4 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index 21596d16a6e14..1dac723aa2c23 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index 8647d89c91d7a..c6dc1f201efa2 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index 8785458e42f2c..bd46532d87f10 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index 5de07bad6bd06..cdd46304c932b 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index 984f41cd1e9bf..0b735b9106189 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index 96b822ad4aa86..befca87972c19 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index c04cc200e06bd..cceb5353d74c0 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index 2abd37c9f82d0..c957dd10ed652 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index 47cbc0bb1c534..4a1ee3b1af51d 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define N 400
 ;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index e82869616d63c..9a6d8fbe12755 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index dde847bb8d4d7..9f47694022868 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; struct com {
 ; double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 84222f73b7c6d..5778126ad8f17 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index 10314606a8123..e5b005f17dcc7 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index 992df969f9cc2..a7f7ebc130362 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-fold-accesses -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-precise-fold-accesses \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index 79fd4c286745e..5f3080a12fdbe 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 62ff184f7a6b6..31cc633fa65c6 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index aa7932fb737f0..fd8048b11f14b 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The load access to A has a pointer-bitcast to another elements size before the
 ; GetElementPtr. Verify that we do not the GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index 0475506fa9f1a..9daae4b1ce3db 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not use the GetElementPtr information to delinearize A
 ; because of the cast in-between. Use the single-dimensional modeling instead.
diff --git a/polly/test/ScopInfo/multidim_invalid_dimension.ll b/polly/test/ScopInfo/multidim_invalid_dimension.ll
index 1cf79f1bd8de1..e1ec2e1ce3be0 100644
--- a/polly/test/ScopInfo/multidim_invalid_dimension.ll
+++ b/polly/test/ScopInfo/multidim_invalid_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 7779748c8c7f6..92b42a9e7a870 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 49e0a9b60657b..261cba1e68aad 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index a4edc9e725ac4..f0f1c2b1f39db 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index c98aece41a9e1..6ee9798a050d7 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index 12c8d97f5d63b..e238bddf4783b 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index a9685d12eb178..33b321716edc3 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index bb9c302eaf06a..39ea4243d9426 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 7f0c8b12be9ba..7f7f7f91067e2 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(int n, int m, int o, double A[n][m][o]) {
 ;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 797a037a6770e..1675110ffd6f1 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index 3a21702b36727..da9827fd5f2c6 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(long n, long m, float A[][n][m]) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index cc3fa87c8ba04..c86b5f0ae2386 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ;
 ; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index 117671ddc6a22..da563a05560cd 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, long *p) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 5ebe0daaec470..7059e5396987b 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly
'-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll index 5c1b0ea7e6150..88c8c6af648e0 100644 --- a/polly/test/ScopInfo/multidim_srem.ll +++ b/polly/test/ScopInfo/multidim_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, float A[][n][n]) { ; for (long i = 0; i < 200; i++) diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll index 941ec637dba3d..0ab9c2d93ff46 100644 --- a/polly/test/ScopInfo/multidim_with_bitcast.ll +++ b/polly/test/ScopInfo/multidim_with_bitcast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll index ecfc0012fd59f..65416e6fffda3 100644 --- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll +++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { 
diff --git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll index 9ae664fd497c8..910e624adb50a 100644 --- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll +++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; // For the following accesses the offset expression from the base pointer ; // is not always a multiple of the type size. diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll index 6530dbf8d75be..cb0630da1b2e6 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll index 7f5f995fd6d26..7349c5ae48ba2 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. 
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll index 5890a5a2ea3bf..df280c88f8668 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll index 3e8390aad300f..b9494187d0ff3 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll index 4e71f9b5dd66b..e971ccc0ba448 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; ; void foo(long n, long m, char A[][m]) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll index 9899fe4bde7ed..34179508cae89 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, long m, char A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll index 753386575d33a..84d7d3349e29d 100644 --- a/polly/test/ScopInfo/multiple-types.ll +++ b/polly/test/ScopInfo/multiple-types.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ +; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll index 218e5c4108c90..b0c425ee62cc4 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll index d3a70fdb96130..ff0ec47be1c58 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll index 0aa25f4ad70f6..e5085daa2ca16 100644 --- a/polly/test/ScopInfo/multiple_latch_blocks.ll +++ b/polly/test/ScopInfo/multiple_latch_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) }; diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll index 7998a3896d9d4..91002979f4fa4 100644 --- a/polly/test/ScopInfo/nested-loops.ll +++ b/polly/test/ScopInfo/nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll index f1ad40baf33ea..df010846bed20 100644 --- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll +++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not generate any scalar dependences regarding x. 
It is ; defined and used on the non-affine subregion only, thus we do not need diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll index 0248004c27f50..3fb655e60f1c0 100644 --- a/polly/test/ScopInfo/non-affine-region-phi.ll +++ b/polly/test/ScopInfo/non-affine-region-phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s --check-prefix=CODE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify there is a phi in the non-affine region but it is not represented in ; the SCoP as all operands as well as the uses are inside the region too. diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll index 158fe772c6d29..4c3ca4d21447d 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Stmt_loop3 ; CHECK: Domain := diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll index bcb542f2cbf70..f4c028ac23409 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly<no-default-opts>' -disable-output +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_loop2__TO__loop[] }; diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll index d100b514a0be3..d55344b355f13 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-1.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do hoist the invariant access to I with an execution context ; as the address computation might wrap in the original but not in our diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll index fad8fcd918446..79ef3b88cb4f0 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-2.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll @@ -1,4 +1,4 @@ -; RUN:
opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll index d032644c9e5ff..aa92847661165 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-3.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll index c1ba7ddc62584..2a2241cb5a993 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-4.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we hoist I[0] without execution context even though it ; is executed in a statement with an invalid domain. diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll index c188b5f74b1e9..a414c7c0fed17 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-5.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do not hoist I[c] without execution context because it ; is executed in a statement with an invalid domain and it depends diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll index b1c19745f1424..1300617f00eeb 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-6.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we model the execution context correctly. 
; diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll index ad69141a12c66..81d43db5c3522 100644 --- a/polly/test/ScopInfo/non-pure-function-call.ll +++ b/polly/test/ScopInfo/non-pure-function-call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [N] -> { : } diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll index 38e1c03a35227..6cbb41041be88 100644 --- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll +++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Error blocks are skipped during SCoP detection. We skip them during ; SCoP formation too as they might contain instructions we can not handle. diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll index d45c32ede7088..f97644052272d 100644 --- a/polly/test/ScopInfo/non-pure-function-calls.ll +++ b/polly/test/ScopInfo/non-pure-function-calls.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Allow the user to define function names that are treated as ; error functions and assumed not to be executed. diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll index 0f5d9e7c43e4e..0338edf053297 100644 --- a/polly/test/ScopInfo/non_affine_access.ll +++ b/polly/test/ScopInfo/non_affine_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long *A) { diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll index 5934962f81567..8980a711b325d 100644 --- a/polly/test/ScopInfo/non_affine_region_1.ll +++ b/polly/test/ScopInfo/non_affine_region_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify only the incoming scalar x is modeled as a read in the non-affine ; region. 
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll index aa083616cac8e..b2e072f7a3bfa 100644 --- a/polly/test/ScopInfo/non_affine_region_2.ll +++ b/polly/test/ScopInfo/non_affine_region_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll index b7c4c1b9bd545..d850cb5c95aad 100644 --- a/polly/test/ScopInfo/non_affine_region_3.ll +++ b/polly/test/ScopInfo/non_affine_region_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll index 12cda0a53fb3b..c5309734a668e 100644 --- a/polly/test/ScopInfo/non_affine_region_4.ll +++ b/polly/test/ScopInfo/non_affine_region_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that both scalars (x and y) are properly written in the non-affine ; region and read afterwards. 
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll index a52aae0d59168..b1ce00f0df94e 100644 --- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll +++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] }; diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll index 84f6564ae4a2e..3a961b2dc1719 100644 --- a/polly/test/ScopInfo/not-a-reduction.ll +++ b/polly/test/ScopInfo/not-a-reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s ;#define TYPE float ;#define NUM 4 diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll index 23b9d3caf741d..f4f79525069e5 100644 --- a/polly/test/ScopInfo/opaque-struct.ll +++ b/polly/test/ScopInfo/opaque-struct.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check that we do not crash with unsized (opaque) types. ; diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll index e069ccac55340..eed27b1c4d9dd 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s ; ; Check whether %newval is identified as escaping value, even though it is used ; in a phi that is in the region. Non-affine subregion case. 
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll index 27ea11a23a3fe..44da399e704d8 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] }; diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll index aaad0dfb2ee60..e5dd359158b8b 100644 --- a/polly/test/ScopInfo/parameter-constant-division.ll +++ b/polly/test/ScopInfo/parameter-constant-division.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll index 444f9a9c24b4e..b295f17f628af 100644 --- a/polly/test/ScopInfo/parameter_in_dead_statement.ll +++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; Verify we do not create assumptions based on the parameter p_1 which is the ; load %0 and due to error-assumptions not "part of the SCoP". 
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll index 9e6e3d0e1446e..2fe16f9d95f6d 100644 --- a/polly/test/ScopInfo/parameter_product.ll +++ b/polly/test/ScopInfo/parameter_product.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; int n, m; ; void foo(char* __restrict a) diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll index 20986d17b8f0d..6544aaec76f74 100644 --- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll +++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the access function of the store is simple and concise ; diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll index 8d62f156a4394..f3923f6127cdd 100644 --- a/polly/test/ScopInfo/partially_invariant_load_1.ll +++ b/polly/test/ScopInfo/partially_invariant_load_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll index 48580907b2f0b..d0d74ad99e09b 100644 --- a/polly/test/ScopInfo/partially_invariant_load_2.ll +++ b/polly/test/ScopInfo/partially_invariant_load_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not try to preload *I and assume p != 42. ; diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll index 6d98a6813862e..fbbc158b566bb 100644 --- a/polly/test/ScopInfo/phi-in-non-affine-region.ll +++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as ; incoming value for the tmp11 PHI node. 
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll index 251be099c1f49..a1eadff3e9717 100644 --- a/polly/test/ScopInfo/phi_after_error_block.ll +++ b/polly/test/ScopInfo/phi_after_error_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s declare void @bar() diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll index bd5c51e968ff5..a889ec96a4b12 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_1.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll index 281b8d33b7756..b56b77e1f4534 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_2.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll index 6d7f0e9484113..14fdc38201bc8 100644 --- a/polly/test/ScopInfo/phi_conditional_simple_1.ll +++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll index 2e62dcd5799a3..76e5507f24b06 100644 --- a/polly/test/ScopInfo/phi_loop_carried_float.ll +++ b/polly/test/ScopInfo/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll index 57d02f24f781b..c97d9a27b24b7 100644 --- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll +++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare>' -disable-output < %s +; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @funa() align 2 diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll index 600c94e1d9b4c..ffd1a37f8a79f 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_1.ll +++ 
b/polly/test/ScopInfo/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context should be empty since the <nsw> flags on the IV ; increments already guarantee that there is no wrap in the loop trip diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll index d3353ddc5e4e8..0d6d9029c61c3 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_2.ll +++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll index 1b01a98fca06a..9c98ec0c603cf 100644 --- a/polly/test/ScopInfo/phi_with_invoke_edge.ll +++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @generic_personality_v0(i32, i64, ptr, ptr) diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll index 1b983ace1b6a4..18ba18c69f1f9 100644 --- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll +++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B) { ; while (A != B) { diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll index f80c4978669c4..846640ac630ff 100644 --- a/polly/test/ScopInfo/pointer-comparison.ll +++ b/polly/test/ScopInfo/pointer-comparison.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: FIXME: Investigate why we need a InvalidContext here. 
; diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll index 0fdd0bea6f219..89dce6536a107 100644 --- a/polly/test/ScopInfo/pointer-type-expressions.ll +++ b/polly/test/ScopInfo/pointer-type-expressions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll index 8ad531d93d290..7b6d0d542581b 100644 --- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll +++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; In this test case we pass a pointer %A into a PHI node and also use this ; pointer as base pointer of an array store. As a result, we get both scalar diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll index 7dfa1ec7905ba..13087a517501a 100644 --- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll +++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_bb9 diff --git a/polly/test/ScopInfo/pr38218.ll b/polly/test/ScopInfo/pr38218.ll index 2c22b1464876d..74103f9a2ac38 100644 --- a/polly/test/ScopInfo/pr38218.ll +++ b/polly/test/ScopInfo/pr38218.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This code causes the SCoP to be rejected because of an ERRORBLOCK ; assumption and made Polly crash (llvm.org/PR38219).
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll index 800b0339a1422..33fa0126aa30e 100644 --- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll +++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll index 9cb932eeef18a..2d06f4b995976 100644 --- a/polly/test/ScopInfo/process_added_dimensions.ll +++ b/polly/test/ScopInfo/process_added_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll index 62909f8c3e4c5..931e08fb8f2fc 100644 --- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll +++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops '-pass-remarks-analysis=.*' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s ; Make sure we hit the complexity bailout, and don't crash. 
; CHECK: Low complexity assumption: { : false } diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll index a6e51c7f2048c..03562b1fd1245 100644 --- a/polly/test/ScopInfo/ranged_parameter.ll +++ b/polly/test/ScopInfo/ranged_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll index 554dd6e38cd00..18cbbf3b87cd6 100644 --- a/polly/test/ScopInfo/ranged_parameter_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true -debug < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \ +; RUN: -debug 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll index 7ae15c34c94c6..d236eeeefc11c 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; __wrapping__ range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll index 00c3caa9c50ce..fc0a737a5edbe 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the context is built fast and does not explode due to us ; combining a large number of non-convex ranges. 
Instead, after a certain diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll index 528dbb102ecb0..7e6f2406a0ac8 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll index 6bc1fe71f35f2..18e6c1fac9e15 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll index 7c78d621930c5..f04163e480284 100644 --- a/polly/test/ScopInfo/read-only-scalars.ll +++ b/polly/test/ScopInfo/read-only-scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS ; CHECK-NOT: Memref_scalar diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll index c1cb618a45f64..7bac53a2b6b51 100644 --- a/polly/test/ScopInfo/read-only-statements.ll +++ b/polly/test/ScopInfo/read-only-statements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check we remove read only statements. 
; diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll index 474c6ac64ffc1..e38ff6046ac01 100644 --- a/polly/test/ScopInfo/reduction_alternating_base.ll +++ b/polly/test/ScopInfo/reduction_alternating_base.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; ; void f(int *A) { diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll index e91eeaf544a05..17f9dc57f2823 100644 --- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll +++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Reduction Type: NONE ; diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll index 5c169f71f4fe8..d2786d5fd6779 100644 --- a/polly/test/ScopInfo/reduction_different_index.ll +++ b/polly/test/ScopInfo/reduction_different_index.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify if the following case is not detected as reduction. ; ; void f(int *A,int *sum) { diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll index 93ab77be84de9..710ae3e74f21a 100644 --- a/polly/test/ScopInfo/reduction_different_index1.ll +++ b/polly/test/ScopInfo/reduction_different_index1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify if the following case is not detected as reduction. 
 ;
 ; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 618e4d3ab3f98..61228e075dabe 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: +
 ; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
index a7721d1b42e46..d126d3d833ee1 100644
--- a/polly/test/ScopInfo/reduction_double.ll
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Verify if two independent reductions in same loop is detected
 ;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index 86923458ee773..c66a8be0852fa 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index 641d2e7337e77..c574d315b2fe1 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
index dd2a76ebbd368..92a071ea1c372 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
index 53a62a3b857e9..4f7d3681e0a0b 100644
--- a/polly/test/ScopInfo/reduction_if.ll
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Verify if reduction spread across multiple blocks in a single scop statement are detected
 ;
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
index cb54cd9581368..7acac4b150f40 100644
--- a/polly/test/ScopInfo/reduction_indirect_access.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ; CHECK: MemRef_INDICES[i0]
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
index 5642a8470f124..331953991d86c 100644
--- a/polly/test/ScopInfo/reduction_indirect_access_2.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Validate that the accesses to INDICES[i] is not part of a reduction.
 ;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index 9e6b3cd431083..9846f1029c087 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; int f() {
 ; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index 7ae7d8ed3ffa2..4d70e53304556 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *sums) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
index 6f2f48005bdac..62ae1fef187b6 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: MemRef_sum
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
index 2fd71c28d5211..7ca46fa9535ac 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Sum is added twice in the statement. Hence no reduction.
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
index 4f049a3505b09..b77c72a291744 100644
--- a/polly/test/ScopInfo/reduction_multiple_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Should not be identified as reduction as there are different operations
 ; involved on sum (multiplication followed by addition)
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 0d016674ffc08..800eb2043dc62 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 568513aedfa10..49ebdcb044988 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 0ac50b3b92c47..77b71f4df301b 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: NONE
 ; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index f01b641b17f64..61aaa051e49d1 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index 51685dca8b7da..fb6d236764b74 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index 67139bba2fded..aa4cd00f39f59 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Function: f_no_fast_math
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index c17184624c066..e385b66f9db21 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index 7cc50bfe78906..e22eccbb2831d 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 35cb9dfcdb122..8f00954f7efc3 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index 7512da420af0e..ad71f1f59e18b 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The constraint that r2 has to be bigger than r1 is implicitly contained in
 ; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index 38d58c97e1b05..d4720511b7aad 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Statements {
 ; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index 2d6ace988659d..10cc57aa27a14 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
 ; CHECK: remark: test/ScopInfo/remarks.c:9:15: Inbounds assumption: [N, M, Debug] -> { : M <= 100 }
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index 3bb5bfb0765e3..abf0b0e23855c 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index dd6115c421d0c..487c585cb9d9c 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not generate an empty invalid context only because the wrap
 ; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index e8df1eccd5945..702b7dc5e0049 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,5 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: bb124 => bb176
 ;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 2a8853322f1d5..559c38d2682ef 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; A valid Scop would print the list of it's statements, we check that we do not
 ; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 5e71e7a9d2a46..3cf4c40bdb60f 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,5 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: for => return
 ;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index 286f878f935f4..51ab81476d542 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, float *B, float *C, long N) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index 9f4d6f5895aeb..dd809ba156c79 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
 ; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index db8371d96b118..812d2fddc3c8e 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index a09bdaf06844e..59549f3dbbad5 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int c, int d) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index e71c515fa2d35..3f61d0d723046 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; ModuleID = 'scalar_to_array.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index 66c50dcbe13f3..fa0c81fe9a48e 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index e0956df0b1e84..6b9efd4b37c7d 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Region: %if.then6---%return
 ;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 4a280cc929e3a..6e2ed1240b071 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index 777c0088c4ddd..d0e8a2accaa2c 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 15dea5a7f4dd8..9ffc30f7360e9 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index 9ac6643564f7b..65f2f99b48c1b 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index 1657d2f37d8ba..7c36f8d7f72e8 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; This test contains a infinite loop (bb13) and crashed the domain generation
 ; at some point. Just verify it does not anymore.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index 76bb438d43ff7..c8a234e9cbce7 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
 target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 135976e7d51c6..93b63df1c5841 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index 1d9f5c2edebcb..e736f3382d905 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index 877f860ba5a90..ae83dd633b96e 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index d3834297e2668..c4a96e4381c94 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], unsigned N) {
 ; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 1da6053a8316b..37e907dc006f3 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 0d44bf64ffc18..7f2cf5caa1ce7 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index f70b3fa3ea21a..4df0d343b0fc9 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 @.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index 3ba2b35e7e503..8968e13192477 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
 define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index aa72db3065259..0a294f2016eba 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index 54832607f11d5..a46acb090b7fd 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index 0a4284bdd34f5..3a5ebf0725b10 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index ed2180407c68d..9edd0f0a13e59 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index f35a07c8d7176..d645becb19583 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 41721867f1764..1a1ccff4f02d6 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index 0521525e272b3..594b36279d6bc 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 82a85aa5f0099..6c9f1c2cb5fd0 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 1f21c0ce7225f..07abe46ac0399 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 580ffab567846..9a42ae3a37270 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index 67e8f631312ea..ba4801d9a0006 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The statement Stmt_for_if_else_1 should be removed because it has no
 ; sideeffects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0f9e83210661b..0c3610185e6e0 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index 9defd41f25231..f0056da37955d 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index faaa4d0254db9..a1810bf6ef538 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index c82e703a82965..00665fd75cbcd 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 5a49be8d80975..2de3695649404 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index 379981b167039..b859840ee111f 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index 0c8efc590b9c9..f73d97f70b28d 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int c, int N) {
 ; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 09cc95e42a584..4f02176569b73 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(long A[], int N, int *init_ptr) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index d64bdf985c1d2..746350422d6b9 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invalid Context:
 ; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index d531dd8e5ab08..44222c88dfa77 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index 3f5d1faf4c377..c78a5337fdeba 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index d20f375b9a2bd..5a80a873cd476 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index aa2be1003adcc..e2723a8a9a2e9 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not create a SCoP in the presence of infinite loops.
 ;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index 163642d9072e2..51f3c2d6eb875 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index 5fb08f82b3267..ef117612f6cb3 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index 11418499702df..5b9f980591777 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
 ;
 ; void f(int *A, int b) {
 ; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index e23b3ae5404b5..163170ce74895 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; This test case verifies that we generate numbered statement names in case
 ; no LLVM-IR names are used in the test case. We also verify, that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index 5bc136658ccab..daa1f8c783870 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
 ;
 ; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index 3f6bb937ded1a..ca8daa4de01a6 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
 ; Check the effect of -polly-unprofitable-scalar-accs
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 608b6d6e50a36..0529ded1f6cfb 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N, unsigned P) {
 ; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 58d39dc239ac9..1c06b55300b67 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index cda666d6f5ebf..153639c42b384 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 50de3c59892e7..34561fc4645cc 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index 4dd75e526407d..be539b47123bc 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index fff131292271a..61716ecec0d90 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index f54b9bec6e7df..d25a9576e863a 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 20afd17f86793..0310fdde6d26e 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 6fa6cc12990a3..47bfc6065b1a8 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
; diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll index 4a3b604d81f0f..1b73c0d6dd7ee 100644 --- a/polly/test/ScopInfo/unsigned_wrap_ult.ll +++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Unsigned wrap-around check. ; diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll index ce8dd921cec16..74088120e4015 100644 --- a/polly/test/ScopInfo/user_context.ll +++ b/polly/test/ScopInfo/user_context.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-polly-context=[N] -> {: N = 1024}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX -; RUN: opt %loadNPMPolly '-polly-context=[N,M] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-polly-context=[] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX +; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N) { ; int i; diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll index c35ed9060e504..bd13ba8bb6961 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) } ; diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll index 2afe99fd2c53b..45f59170942ed 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [n] -> { : 
-9223372036854775808 <= n <= 100 } diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll index 3479558062671..fb71c75aa75e4 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; REMARK: remark: <unknown>:0:0: SCoP begins here. ; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 } diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll index 0bd99ea3fcb35..49b23b1e784dc 100644 --- a/polly/test/ScopInfo/user_provided_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_assumptions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M } diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll index 1499ab98f7369..f8643b68cc63f 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_2.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. 
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : } diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll index aa1f72dddde9d..70f8f359e16cd 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_3.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 } diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll index a6eed5df2063e..3e7883db48fcb 100644 --- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [i, N, M] -> { : N <= i or (N > i and N >= 0) } @@ -17,7 +18,8 @@ ; -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output -pass-remarks-output=%t.yaml < %s 2>&1 +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; YAML: --- !Analysis ; YAML: Pass: polly-scops diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll index 36beaf5f0f016..32cb114fab05a 100644 --- a/polly/test/ScopInfo/variant_base_pointer.ll +++ b/polly/test/ScopInfo/variant_base_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s ; ; %tmp is added to the list of required hoists by -polly-scops and just ; assumed to be hoisted. 
Only -polly-scops recognizes it to be unhoistable diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll index 5602c443b25d3..6a28bd0405fdd 100644 --- a/polly/test/ScopInfo/variant_load_empty_domain.ll +++ b/polly/test/ScopInfo/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: } diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll index 3a663f57c2774..f5f06bfd7d336 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_0.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, char N, char p) { ; for (char i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll index 8963e86bc6157..e04257acc2010 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(long *A, long N, long p) { ; for (long i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll index 97cb2c05b16a0..2511c0d646086 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll index 50e2eda2ce574..2106bdf4c0686 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_3.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll index 4ddb43a01bf24..3ea17f6e266bf 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_4.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(char *A, char N, char p) { ; for (char i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll index 440d32bab72a5..90706a3d3bc46 100644 --- 
a/polly/test/ScopInfo/wraping_signed_expr_5.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; We should not generate runtime check for ((int)r1 + (int)r2) as it is known not ; to overflow. However (p + q) can, thus checks are needed. diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll index 7bec9533440fb..9cf67fc101805 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_6.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll index 2d836e191f858..d18d2b2df3e12 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_7.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll index 4964a123d0be1..84626861bd39b 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll index a6db7c06d072c..b4dd567bafa6b 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index b509951bbf0d5..cbe4af05169f8 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) { ; for (unsigned i = 0; i < 
N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll index ea3356e01cc9f..b306045276765 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll index 9fd1afae4b889..3c02ae295b5ba 100644 --- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll +++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [dim] -> { : dim > 0 } diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll index 85198b728a9bb..5c0cfa103f0bf 100644 --- a/polly/test/ScopInliner/ignore-declares.ll +++ b/polly/test/ScopInliner/ignore-declares.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s ; Check that we do not crash if there are declares. We should skip function ; declarations and not try to query for domtree. diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll index 6046fc0f38650..58c556a455fb9 100644 --- a/polly/test/ScopInliner/invariant-load-func.ll +++ b/polly/test/ScopInliner/invariant-load-func.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we inline a function that requires invariant load hoisting ; correctly. diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll index 77a5ddda93adc..f12798a3d831a 100644 --- a/polly/test/ScopInliner/simple-inline-loop.ll +++ b/polly/test/ScopInliner/simple-inline-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we get the 2 nested loops by inlining `to_be_inlined` into ; `inline_site`. 
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll index 5411b6e430c66..4112787e51bfa 100644 --- a/polly/test/Simplify/coalesce_3partials.ll +++ b/polly/test/Simplify/coalesce_3partials.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine 3 partial accesses into one. ; diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll index 888daeff39d8d..b140f287e27f7 100644 --- a/polly/test/Simplify/coalesce_disjointelements.ll +++ b/polly/test/Simplify/coalesce_disjointelements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine four partial stores into two. ; The stores write to the same array, but never the same element. diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll index f492222461b34..ee716fc12f095 100644 --- a/polly/test/Simplify/coalesce_overlapping.ll +++ b/polly/test/Simplify/coalesce_overlapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with overlapping domains) into one. ; diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll index 4df91d43fc46d..aea691f43e934 100644 --- a/polly/test/Simplify/coalesce_partial.ll +++ b/polly/test/Simplify/coalesce_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with disjoint domains) into one. 
; diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll index 399c02381c890..66f94795ea6e4 100644 --- a/polly/test/Simplify/dead_access_load.ll +++ b/polly/test/Simplify/dead_access_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead load-instruction ; (an load whose result is not used anywhere) diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll index 9344a284b311a..fb40e4cc45b35 100644 --- a/polly/test/Simplify/dead_access_phi.ll +++ b/polly/test/Simplify/dead_access_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead PHI write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll index 6db242c97dac0..a8ff7f28542b7 100644 --- a/polly/test/Simplify/dead_access_value.ll +++ b/polly/test/Simplify/dead_access_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead value write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll index 785b5ba154187..81e55e1c7bb30 100644 --- a/polly/test/Simplify/dead_instruction.ll +++ b/polly/test/Simplify/dead_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead instruction ; (an instruction whose result is not used anywhere) diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll index 917ae7f7d2c94..9b06cec965a9d 100644 --- a/polly/test/Simplify/emptyaccessdomain.ll +++ b/polly/test/Simplify/emptyaccessdomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; for (int j = 0; j < n; j += 1) { ; A[0] = 42.0; diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll index d56fed4848ff3..379c7e0ace0a3 100644 --- 
a/polly/test/Simplify/exit_phi_accesses-2.ll +++ b/polly/test/Simplify/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s ; ; The use of %sum.next by %phi counts as an escaping use. ; Don't remove the scalar write of %sum.next. diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll index 65aa9cd28314e..5aa2caba95cfc 100644 --- a/polly/test/Simplify/func-b320a7.ll +++ b/polly/test/Simplify/func-b320a7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines ; llvm.org/PR47098 ; Use-after-free by reference to Stmt remaining in InstStmtMap after removing it has been removed by Scop::simplifyScop. diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll index 6e3a43e0ebbad..5120de2db7677 100644 --- a/polly/test/Simplify/gemm.ll +++ b/polly/test/Simplify/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; void gemm(float A[][1024], float B[][1024], float C[][1024]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll index cba62549227ae..33d04b2f96de8 100644 --- a/polly/test/Simplify/nocoalesce_differentvalues.ll +++ b/polly/test/Simplify/nocoalesce_differentvalues.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that write different values. ; diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll index b589d13779e52..608b055e691df 100644 --- a/polly/test/Simplify/nocoalesce_elementmismatch.ll +++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that do not write to different elements in the ; same instance. 
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll index b61ad9d8031e0..e112b036cd778 100644 --- a/polly/test/Simplify/nocoalesce_readbetween.ll +++ b/polly/test/Simplify/nocoalesce_readbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a read between them. ; Note: The read between is unused, so will be removed by markAndSweep. diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll index be7d159554034..fd5eee52eaf5c 100644 --- a/polly/test/Simplify/nocoalesce_writebetween.ll +++ b/polly/test/Simplify/nocoalesce_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a write between them. ; diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll index 1bd9bfe10a99d..42fafb446cea3 100644 --- a/polly/test/Simplify/notdead_region_exitphi.ll +++ b/polly/test/Simplify/notdead_region_exitphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node in a region's exit block. ; diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll index b59d6dc60b089..966448c9884b2 100644 --- a/polly/test/Simplify/notdead_region_innerphi.ll +++ b/polly/test/Simplify/notdead_region_innerphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node within a region statement (%phi). 
; diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll index 859bd459f72d6..88f6c41521739 100644 --- a/polly/test/Simplify/notredundant_region_loop.ll +++ b/polly/test/Simplify/notredundant_region_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not remove the store in region_entry. It can be executed multiple times ; due to being part of a non-affine loop. diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll index a742ea889fb1f..43c05436809ba 100644 --- a/polly/test/Simplify/notredundant_region_middle.ll +++ b/polly/test/Simplify/notredundant_region_middle.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove redundant stores in the middle of region statements. ; The store in region_true could be removed, but in practice we do try to diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll index 8542b7927f860..8a9aec8be9e05 100644 --- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll +++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove the scalar value write of %i.trunc in inner.for. ; It is used by body. diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll index 06b082c3f81fa..7218f328f9ca3 100644 --- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s ; ; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of ; %tmp4, it is an "external use". 
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll index bc5b2dffd443d..eccdd8044d073 100644 --- a/polly/test/Simplify/overwritten.ll +++ b/polly/test/Simplify/overwritten.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll index 861c9acda3e9c..4cee4f13d26d0 100644 --- a/polly/test/Simplify/overwritten_3phi.ll +++ b/polly/test/Simplify/overwritten_3phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll index cfd5a08143d60..c9f06c85dba53 100644 --- a/polly/test/Simplify/overwritten_3store.ll +++ b/polly/test/Simplify/overwritten_3store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that even multiple stores are removed. diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll index 306e726e7808a..b1b7635e26263 100644 --- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll +++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that this works even if one of the writes is a scalar MemoryKind. diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll index 170838ddb8a1a..cdca2f11531e7 100644 --- a/polly/test/Simplify/overwritten_loadbetween.ll +++ b/polly/test/Simplify/overwritten_loadbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Do not remove overwrites when the value is read before. 
; diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll index a1e7da40554d5..700adb6aed2ec 100644 --- a/polly/test/Simplify/overwritten_scalar.ll +++ b/polly/test/Simplify/overwritten_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll index 6d9c99f9dc270..4d1d800b2a80b 100644 --- a/polly/test/Simplify/pass_existence.ll +++ b/polly/test/Simplify/pass_existence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<simplify>' -polly-print-simplify -aa-pipeline=basic-aa < %s < %s | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s ; ; Simple test for the existence of the Simplify pass. ; diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll index ba1cffee1a0df..2bb05738955a3 100644 --- a/polly/test/Simplify/phi_in_regionstmt.ll +++ b/polly/test/Simplify/phi_in_regionstmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region ; statement. 
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll index 5130eb8488ca2..22921d5fba509 100644 --- a/polly/test/Simplify/pr33323.ll +++ b/polly/test/Simplify/pr33323.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; llvm.org/PR33323 ; diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll index f2489a74eb899..540e537460e54 100644 --- a/polly/test/Simplify/redundant.ll +++ b/polly/test/Simplify/redundant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll index efd20e90ae748..5ce25836dedbd 100644 --- a/polly/test/Simplify/redundant_differentindex.ll +++ b/polly/test/Simplify/redundant_differentindex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; A store that has a different index than the load it is storing is ; not redundant. diff --git a/polly/test/Simplify/redundant_partialwrite.ll b/polly/test/Simplify/redundant_partialwrite.ll index 357b63206b0f5..ac5ca907fff6f 100644 --- a/polly/test/Simplify/redundant_partialwrite.ll +++ b/polly/test/Simplify/redundant_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;simplify>' -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadPolly -polly-import-jscop-postfix=transformed -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove a redundant store, if its partial domain is a subset of the ; read's domain. diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll index c60d28b7039dd..927aac6c4af05 100644 --- a/polly/test/Simplify/redundant_region.ll +++ b/polly/test/Simplify/redundant_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. 
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll index 3de50c04b614f..72d570d46bdce 100644 --- a/polly/test/Simplify/redundant_region_scalar.ll +++ b/polly/test/Simplify/redundant_region_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll index 13ca40f8e1b87..84cb971be11fd 100644 --- a/polly/test/Simplify/redundant_scalarwrite.ll +++ b/polly/test/Simplify/redundant_scalarwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant scalar stores. ; diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll index 47d9cfde2d3ce..6540d7751e469 100644 --- a/polly/test/Simplify/redundant_storebetween.ll +++ b/polly/test/Simplify/redundant_storebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Don't remove store where there is another store to the same target ; in-between them. diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll index 969aade275af2..c6e36f9dcdefb 100644 --- a/polly/test/Simplify/scalability1.ll +++ b/polly/test/Simplify/scalability1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. ; diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll index 7951094867f2f..adcf9eef348a9 100644 --- a/polly/test/Simplify/scalability2.ll +++ b/polly/test/Simplify/scalability2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. 
; diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll index ad41f2566e2b5..495d77a22f618 100644 --- a/polly/test/Simplify/sweep_mapped_phi.ll +++ b/polly/test/Simplify/sweep_mapped_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed. ; diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll index a50c013ac7917..c83941a8f0ba5 100644 --- a/polly/test/Simplify/sweep_mapped_value.ll +++ b/polly/test/Simplify/sweep_mapped_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed. ; diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll index 4c05de975fdf8..f2436c263a96a 100644 --- a/polly/test/Simplify/ununsed_read_in_region_entry.ll +++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;simplify>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; for (int i = 0; i < n; i+=1) { ; (void)A[0]; diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll index b75dd872ad404..872a32fad4fed 100644 --- a/polly/test/Support/Plugins.ll +++ b/polly/test/Support/Plugins.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \ +; RUN: | FileCheck %s ; This testcase tests plugin registration. Check-lines below serve to verify ; that the passes actually ran. 
diff --git a/polly/test/Support/exportjson.ll b/polly/test/Support/exportjson.ll index 6bdf5a4c33cf3..22cfea23534cb 100644 --- a/polly/test/Support/exportjson.ll +++ b/polly/test/Support/exportjson.ll @@ -1,6 +1,6 @@ ; RUN: rm -rf %t ; RUN: mkdir -p %t -; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t '-passes=polly-custom<export-jscop>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t -polly -O2 -polly-export -S < %s ; RUN: FileCheck %s -input-file %t/exportjson___%entry.split---%return.jscop ; ; for (int j = 0; j < n; j += 1) { @@ -9,22 +9,28 @@ ; define void @exportjson(i32 %n, ptr noalias nonnull %A) { entry: - br label %entry.split + br label %for -entry.split: - %j.cmp1 = icmp sgt i32 %n, 0 - br i1 %j.cmp1, label %body.lr.ph, label %return +for: + %j = phi i32 [0, %entry], [%j.inc, %inc] + %j.cmp = icmp slt i32 %j, %n + br i1 %j.cmp, label %body, label %exit -body.lr.ph: - store double 4.200000e+01, ptr %A, align 8 + body: + store double 42.0, ptr %A + br label %inc + +inc: + %j.inc = add nuw nsw i32 %j, 1 + br label %for + +exit: br label %return return: ret void } -attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } - ; CHECK: { ; CHECK-NEXT: "arrays": [ diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll index 6c8b2e97682e8..206cb73bfc5ab 100644 --- a/polly/test/Support/isl-args.ll +++ b/polly/test/Support/isl-args.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP -; RUN: not opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1 | FileCheck %s -match-full-lines --check-prefix=UNKNOWN -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP +; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s ; VERSION: isl-{{.*}}-IMath-32 ; HELP: Usage: -polly-isl-arg [OPTION...] 
diff --git a/polly/test/Support/pipelineposition.ll b/polly/test/Support/pipelineposition.ll index 1ddfb5879ce16..a4506ba1d64ed 100644 --- a/polly/test/Support/pipelineposition.ll +++ b/polly/test/Support/pipelineposition.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1 -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3 +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1 +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3 +; +; REQUIRES: asserts ; ; void callee(int n, double A[], int i) { ; for (int j = 0; j < n; j += 1) diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in index ca901b8825ced..f22063e796def 100644 --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -38,10 +38,14 @@ if config.llvm_polly_link_into_tools == '' or \ config.llvm_polly_link_into_tools.lower() == 'false' or \ config.llvm_polly_link_into_tools.lower() == 'notfound' or \ config.llvm_polly_link_into_tools.lower() == 'llvm_polly_link_into_tools-notfound': + config.substitutions.append(('%loadPolly', '-load ' + + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@' + + commonOpts )) config.substitutions.append(('%loadNPMPolly', '-load-pass-plugin ' + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@' + commonOpts )) else: + config.substitutions.append(('%loadPolly', commonOpts )) config.substitutions.append(('%loadNPMPolly', commonOpts )) import lit.llvm diff --git a/polly/test/polly.ll b/polly/test/polly.ll index 0f5467b0e654d..2e455b39a9cd4 100644 --- a/polly/test/polly.ll +++ b/polly/test/polly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @foo() nounwind { start: From bc08e69959ecefecc7ea41b648a659aa19c458c8 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.hanig@outlook.com> Date: Tue, 4 Nov 2025 07:24:56 +0530 Subject: [PATCH 110/313] [clang][modules] Fix crash in enum visibility lookup for C++20 header units (#166272) Fixes #165445. Fixes a crash when `ASTWriter::GenerateNameLookupTable` processes enum constants from C++20 header units. The special handling for enum constants, introduced in fccc6ee, doesn't account for declarations whose owning module is a C++20 header unit. It calls `isNamedModule()` on the result of `getTopLevelOwningNamedModule()`, which returns null for header units, causing a null pointer dereference. 
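As an illustration, here is a minimal sketch of the failure mode, reduced from the ASTDeclContextNameLookupTrait hunk below; the simplified control flow is an assumption made for readability, not the literal Clang code:

  // Reduced sketch (assumption: simplified from ASTWriter's name-lookup
  // table generation; the real condition is in the hunk below).
  if (auto *ECD = dyn_cast<EnumConstantDecl>(D)) {
    // For a C++20 header unit, getTopLevelOwningNamedModule() returns
    // nullptr, so the old guard dereferenced a null pointer:
    //   ECD->getOwningModule() &&
    //   ECD->getTopLevelOwningNamedModule()->isNamedModule()  // crash
    // The fix tests the pointer itself, which is null-safe and lets the
    // separate getOwningModule() check be dropped:
    if (DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) {
      // ... enum-constant handling ...
    }
  }

The new regression test below exercises exactly this path by making the same enum visible both through a header unit and through named module interfaces.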
--- clang/lib/Serialization/ASTWriter.cpp | 3 +- ...rash-enum-visibility-with-header-unit.cppm | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 clang/test/Modules/crash-enum-visibility-with-header-unit.cppm diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 3ac338e013deb..b1fd151790d96 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait // parent of parent. We DON'T remove the enum constant from its parent. So // we don't need to care about merging problems here. if (auto *ECD = dyn_cast<EnumConstantDecl>(D); - ECD && DC.isFileContext() && ECD->getOwningModule() && - ECD->getTopLevelOwningNamedModule()->isNamedModule()) { + ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) { if (llvm::all_of( DC.noload_lookup( cast<EnumDecl>(ECD->getDeclContext())->getDeclName()), diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm new file mode 100644 index 0000000000000..90c57796dcf7e --- /dev/null +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm @@ -0,0 +1,46 @@ +// Fixes #165445 + +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \ +// RUN: -emit-header-unit -o %t/header.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/A.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/B.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \ +// RUN: -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm \ +// RUN: -fmodule-file=%t/header.pcm \ +// RUN: -verify -fsyntax-only + +//--- enum.h +enum E { Value }; + +//--- header.h +#include "enum.h" + +//--- A.cppm +module; +#include "enum.h" +export module A; + +auto e = Value; + +//--- B.cppm +export module B; +import "header.h"; + +auto e = Value; + +//--- use.cpp +// expected-no-diagnostics +import A; +import B; +#include "enum.h" + +auto e = Value; From 03e78a9976cf7a313c841732432002cbb4146d10 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan <qcf@ecnelises.com> Date: Tue, 4 Nov 2025 10:18:53 +0800 Subject: [PATCH 111/313] [Clang][Sema] Check null after ExtractTypeForDeductionGuide (#165776) --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 3 ++- clang/test/SemaTemplate/ctad.cpp | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index db695d86b5416..2c575cd1d4289 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -505,6 +505,7 @@ Bug Fixes to C++ Support nontrivial member when another member has an initializer. (#GH81774) - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092) - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753) +- Fix a crash when extracting unavailable member type from alias in template deduction. 
(#GH165560) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index ad50600f6399c..bfcd3978817ca 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -659,7 +659,8 @@ struct ConvertConstructorToDeductionGuideTransform { SemaRef, MaterializedTypedefs, NestedPattern, TransformingOuterPatterns ? &Args : nullptr) .transform(NewDI); - + if (!NewDI) + return nullptr; // Resolving a wording defect, we also inherit default arguments from the // constructor. ExprResult NewDefArg; diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 1a575ea527006..60603f0c963a5 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument { auto x = C(D<A::B>()); } + +namespace pr165560 { +template <class T, class> struct S { + using A = T; + template <class> struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \ + // expected-note{{implicit deduction guide declared as 'template <class> I(pr165560::S<int, int>::I<type-parameter-0-0>) -> pr165560::S<int, int>::I<type-parameter-0-0>'}} + I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}} + }; +}; +S<int, int>::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S<int, int>::I'}} \ + // expected-note{{while building implicit deduction guide first needed here}} +} From 749a0e1e410b4c47de68dee295ad2d08283050c4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 18:19:58 -0800 Subject: [PATCH 112/313] update_llc_test_checks: Recognize thumbv7k triples (#166270) --- llvm/utils/UpdateTestChecks/asm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 469e27facedb0..61f0d679f806d 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -576,6 +576,7 @@ def get_run_handler(triple): "armv7-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), "armv7-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "armv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), + "thumbv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "thumb": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE), "thumb-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), "thumbv5-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), From 4d98ee2a22242bbe12f2fdb13ee49be697546259 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 18:20:24 -0800 Subject: [PATCH 113/313] ARM: Add watchos run line to llvm.sincos test (#166271) --- llvm/test/CodeGen/ARM/llvm.sincos.ll | 221 +++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 0c2263ee9acbf..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s ; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s ; RUN: llc -mtriple=armv7-apple-ios7 
-mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { ; GNU-LABEL: test_sincos_f16: @@ -75,6 +76,23 @@ define { half, half } @test_sincos_f16(half %a) { ; IOS-WITH-STRET-NEXT: mov r0, r5 ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } @@ -130,6 +148,22 @@ define half @test_sincos_f16_only_use_sin(half %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 @@ -186,6 +220,22 @@ define half @test_sincos_f16_only_use_cos(half %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 @@ -366,6 +416,54 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #24 ; IOS-WITH-STRET-NEXT: vpop {d8} ; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl 
___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } @@ -416,6 +514,20 @@ define { float, float } @test_sincos_f32(float %a) { ; IOS-WITH-STRET-NEXT: pop {r0, r1} ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } @@ -519,6 +631,33 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { ; IOS-WITH-STRET-NEXT: vpop {d8} ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } @@ -581,6 +720,20 @@ define { double, double } @test_sincos_f64(double %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #16 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: 
.cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } @@ -692,6 +845,39 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { ; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] ; IOS-WITH-STRET-NEXT: add sp, sp, #32 ; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } @@ -778,6 +964,41 @@ define { fp128, fp128 } @test_sincos_f128(fp128 %a) { ; IOS-NEXT: bl _sinl ; IOS-NEXT: stm r4, {r0, r1, r2, r3} ; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f16(fp128 %a) ret { fp128, fp128 } %result } From 70ff2c9a32ec2fa123869be9ca6efdd2f24fca1f Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu <min.hsu@sifive.com> Date: Mon, 3 Nov 2025 18:28:33 -0800 Subject: [PATCH 114/313] [doc][RISCV] Add XSfvfexp* and XSfvfexpa* into RISCVUsage.rst (#166198) They were introduced in #164349 --- llvm/docs/RISCVUsage.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..d03f383a92b3b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -406,6 +406,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + +``Xsfvfexpa`` and ``Xsfvfexpa64e`` + LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + ``XSfvqmaccdod``, ``XSfvqmaccqoq`` LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. From c63cb50080fb9c8a0650234cb711823db306f5c6 Mon Sep 17 00:00:00 2001 From: Jianjian Guan <jacquesguan@me.com> Date: Tue, 4 Nov 2025 10:53:17 +0800 Subject: [PATCH 115/313] [RISCV][GISel] Support select indexed vector load store intrinsics (#165876) --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 125 +- .../RISCV/GlobalISel/rvv/vloxei-rv64.ll | 1341 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vloxei.ll | 5100 +++++++++++++++++ .../RISCV/GlobalISel/rvv/vluxei-rv64.ll | 1341 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vluxei.ll | 5100 +++++++++++++++++ .../RISCV/GlobalISel/rvv/vsoxei-rv64.ll | 1293 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll | 4881 ++++++++++++++++ .../RISCV/GlobalISel/rvv/vsuxei-rv64.ll | 1310 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll | 4881 ++++++++++++++++ 9 files changed, 25368 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d681685..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ class RISCVInstructionSelector : public InstructionSelector { void addVectorLoadStoreOperands(MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool 
selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 +914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == 
Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
+ +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll
new file mode 100644
index 0000000000000..fafd45b7579e8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll
@@ -0,0 +1,5100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+    <vscale x 1 x half> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+    <vscale x 2 x half> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+    <vscale x 4 x half> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
@llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> 
%1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> 
%2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> 
%1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = 
call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll new file mode 100644 index 0000000000000..916af2556c6a8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret 
+entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + 
<vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x 
i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + 
<vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + 
+declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> 
@llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + 
+define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 
x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 
x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> 
@llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + 
<vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + 
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> 
@intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> 
@intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + 
+define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + 
iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> 
@intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> 
@llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + 
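+; Note on the recurring structure of these tests (a summary comment added for
+; readability; the CHECK lines themselves come from update_llc_test_checks.py):
+; unmasked tests pass a poison passthru, so the expected vsetvli uses the
+; "ta, ma" form, while masked tests pass the policy operand iXLen 1 (tail
+; agnostic, mask undisturbed), matching "ta, mu". Where the index EEW differs
+; from the data EEW, the destination and the index operand occupy distinct
+; register groups, which is why some expected sequences carry an extra
+; vmv1r.v or vmv.v.v copy.
+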
+declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x 
i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr 
%0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + 
<vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: 
vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 
8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen 
%4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); 
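+ +; Note: the masked variants in this file pass a trailing policy operand (iXLen 1, i.e. tail agnostic), which is why their vsetvli uses "ta, mu" while the unmasked forms (which take a poison passthru) use "ta, ma". When the data EEW is wider than the index EEW and the destination register group would overlap the index operand, the index vector is first copied out of the way (vmv1r.v/vmv2r.v/vmv4r.v) to satisfy the RVV register-group overlap constraint.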
+ +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + 
<vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 
x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x 
half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret 
<vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x 
float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, 
<vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x 
i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, 
(a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> 
%2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + 
+define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x 
i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> 
@llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> 
@llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define 
<vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll
new file mode 100644
index 0000000000000..4963d91a14988
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll
@@ -0,0 +1,1293 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN:   < %s | FileCheck %s
+
+; The intrinsics are not supported with RV32.
+ +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call 
void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 
x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale 
x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + 
    <vscale x 2 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll
new file mode 100644
index 0000000000000..7ea2e1734e5a2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x 
i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + 
call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, 
<vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr 
%1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + 
<vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: 
vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void 
+define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
@llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), 
v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> 
%2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + 
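+; A short note on the pattern these autogenerated pairs follow (a reading aid,
+; not part of the checked output): the vsetvli in the CHECK lines carries the
+; data SEW/LMUL (e8..e64, mf8..m8) taken from the stored value's type, while
+; the "ei8" suffix on vsoxei8.v fixes the index EEW at 8 bits. The index
+; operand is therefore allocated after the data register group (v9, v10, v12,
+; v16 as the data LMUL grows) and occupies EMUL = (8/SEW)*LMUL registers. The
+; masked variants pass the mask in v0 and are checked via the trailing v0.t.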
+declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, 
ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr 
%1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + 
<vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll new file mode 100644 index 0000000000000..9bd272a368d20 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 
x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: 
vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x 
float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x 
i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
new file mode 100644
index 0000000000000..7cd15454d40b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+
<vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x 
i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x 
i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, 
v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x 
i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x 
i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define 
void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x 
i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void 
+} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 
x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + 
<vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> 
%3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + 
+declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x 
double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} From d470bfe0d46480ebf7f4a2e1dd37005894acb298 Mon Sep 17 00:00:00 2001 From: camc <69519329+camc@users.noreply.github.com> Date: Tue, 4 Nov 2025 03:12:27 +0000 Subject: [PATCH 116/313] [clang] Mark labels referenced when used in named break or continue (#166033) Fixes #166013 Marks labels that appear in a c2y named break or continue statement as referenced to fix false-positive unused diagnostics. --------- Co-authored-by: camc <pushy-crop-cartel@duck.com> --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaStmt.cpp | 3 +++ clang/test/Sema/labeled-break-continue.c | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2c575cd1d4289..2e2c5198fa8f5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -395,6 +395,8 @@ Improvements to Clang's diagnostics that were previously incorrectly accepted in case of other irrelevant conditions are now consistently diagnosed, identical to C++ mode. +- Fix false-positive unused label diagnostic when a label is used in a named break + or continue (#GH166013) - Clang now emits a diagnostic in case `vector_size` or `ext_vector_type` attributes are used with a negative size (#GH165463). 
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index f39896336053e..5b3ef1adf38e3 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope, SourceLocation LabelLoc, bool IsContinue) { assert(Target && "not a named break/continue?"); + + Target->markUsed(S.Context); + Scope *Found = nullptr; for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) { if (Scope->isFunctionScope()) diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c index 78f81c484c3d5..6b4adc23dca8d 100644 --- a/clang/test/Sema/labeled-break-continue.c +++ b/clang/test/Sema/labeled-break-continue.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s void f1() { l1: while (true) { @@ -159,3 +159,15 @@ void f7() { continue d; // expected-error {{'continue' label does not name an enclosing loop}} } } + +void f8() { + l1: // no-warning + while (true) { + break l1; + } + + l2: // no-warning + while (true) { + continue l2; + } +} From 2b45efe92013e1372a2d49f550dd226ff351c594 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Mon, 3 Nov 2025 22:21:26 -0500 Subject: [PATCH 117/313] [AMDGPU] NFC, move testcase, only test output of promote-alloca with vector-combine (#166289) --- .../extract-insert-chain-to-shuffles.ll | 567 ++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll new file mode 100644 index 0000000000000..4b551fad5b43a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll @@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = 
extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, 
i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> 
[[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 
+; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> 
[[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 
x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 
118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 
= extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> 
%75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 = extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x 
i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement <16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement 
<16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = insertelement <16 x i8> %262, i8 %174, i64 7 + %264 = insertelement <16 x i8> %263, i8 %176, i64 8 + %265 = insertelement <16 x i8> %264, i8 %178, i64 9 + %266 = insertelement <16 x i8> %265, i8 %180, i64 10 + %267 = insertelement <16 x i8> %266, i8 %182, i64 11 + %268 = insertelement <16 x i8> %267, i8 %184, i64 12 + %269 = insertelement <16 x i8> %268, i8 %186, i64 13 + %270 = insertelement <16 x i8> %269, i8 %188, i64 14 + %271 = insertelement <16 x i8> %270, i8 %190, i64 15 + %sum = add <16 x i8> %271, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,2" } From bb563b196f0e70b2790cdfe2619fbd34f273b508 Mon Sep 17 00:00:00 2001 From: Kelvin Li <kli@ca.ibm.com> Date: Mon, 3 Nov 2025 22:58:56 -0500 Subject: [PATCH 118/313] [OpenMP][AIX] Not to create symbolic links to libomp.so on AIX (#165890) On AIX, it generates libomp for both static and dynamic. There is no need to create symbolic links to libomp.so. 
--------- Co-authored-by: Xing Xue <xingxue@outlook.com> --- openmp/runtime/src/CMakeLists.txt | 40 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 6ac047a833fe5..5dd7f4b33612d 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -254,23 +254,35 @@ set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) # Add symbolic links to libomp if(NOT WIN32) - add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libgomp${LIBOMP_LIBRARY_SUFFIX} - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libiomp5${LIBOMP_LIBRARY_SUFFIX} - WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} - ) - if(LIBOMP_ENABLE_SHARED) - if(APPLE) - set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) - else() - set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) - endif() + if(AIX) + # On AIX, libomp.a is the name for both static and shared objects. + set(LIBOMP_AIX_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libgomp${LIBOMP_AIX_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libiomp5${LIBOMP_AIX_SUFFIX} WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} ) + else() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libiomp5${LIBOMP_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libgomp${LIBOMP_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + if(LIBOMP_ENABLE_SHARED) + if(APPLE) + set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) + else() + set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) + endif() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + endif() endif() endif() From 00ee53cc7b555ce408e6cd86378e3112bbdb0db8 Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Tue, 4 Nov 2025 12:26:17 +0800 Subject: [PATCH 119/313] [Attributor] Propagate alignment through ptrmask (#150158) Propagate alignment through ptrmask based on potential constant values of mask and align of ptr. 
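To make the propagation rule concrete: the alignment implied by the mask is 1 shifted left by the smallest trailing-zero count across the mask's potential constant values, and the ptrmask result keeps at least the alignment already proven for the source pointer. Below is a minimal standalone C++ sketch of that arithmetic; the helper name is illustrative only, and the Value::MaxAlignmentExponent clamp from the patch is omitted for brevity.

  #include <algorithm>
  #include <cstdint>
  #include <initializer_list>

  // Alignment of a ptrmask result, given the mask's possible (nonzero)
  // constant values and the alignment already known for the pointer.
  static uint64_t alignAfterPtrmask(std::initializer_list<uint64_t> MaskValues,
                                    uint64_t PtrAlign) {
    unsigned MinTZ = 64;
    for (uint64_t M : MaskValues)
      MinTZ = std::min(MinTZ, unsigned(__builtin_ctzll(M))); // trailing zeros
    uint64_t FromMask = uint64_t(1) << MinTZ; // alignment the mask guarantees
    return std::max(FromMask, PtrAlign);      // keep what the pointer had
  }

For example, masks {-32, -8, -16} all have at least 3 trailing zero bits, so a pointer known to be align 2 yields max(1 << 3, 2) = align 8, matching the `align 8` that the new align-ptrmask.ll test expects on such a ptrmask call.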
--------- Co-authored-by: Shilei Tian <i@tianshilei.me> --- llvm/include/llvm/Transforms/IPO/Attributor.h | 11 + .../Transforms/IPO/AttributorAttributes.cpp | 61 +++++ .../X86/min-legal-vector-width.ll | 230 ++++++++++++------ .../Transforms/Attributor/align-ptrmask.ll | 206 ++++++++++++++++ .../Transforms/OpenMP/parallel_deletion.ll | 14 +- 5 files changed, 440 insertions(+), 82 deletions(-) create mode 100644 llvm/test/Transforms/Attributor/align-ptrmask.ll diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..8c0342ae5cf12 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,17 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum trailing zeros of potential constants + unsigned getAssumedMinTrailingZeros() const { + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47aec08b25..a6ac7610a2c7a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? 
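+ // If the mask's constant values alone already guarantee the known
+ // alignment of the ptrmask result, this use adds no alignment
+ // requirement on the underlying pointer; otherwise the result's
+ // known alignment is what this use implies for it.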
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll index 649e9467c0318..fffe50fde1e50 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define 
{{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -66,15 +76,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x 
i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -123,15 +143,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -180,15 +210,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef 
nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -237,13 +277,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -290,13 +338,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 { ; -; CHECK: Function Attrs: inlinehint mustprogress 
nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -343,15 +399,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync 
nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -400,15 +466,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" } attributes #5 = { argmemonly nounwind } ;. 
+; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } +; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +;. ; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. -; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 
%sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 
%cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create AAAlign for %ptr1, +; but the attribute didn't propagate through extractelement, need propagate +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +} 
+ +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void From a3a99c3996ffa2abf7e2b1e4abeaa933830f2ac3 Mon Sep 17 00:00:00 2001 From: quic-k <kushpal@qti.qualcomm.com> Date: Tue, 4 Nov 2025 10:05:31 +0530 Subject: [PATCH 120/313] [compiler-rt][x86] Don't use assert.h when building without a libc (#165384) fixes https://github.com/llvm/llvm-project/issues/164932 Signed-off-by: Kushal Pal <kushpal@qti.qualcomm.com> Co-authored-by: Saleem Abdulrasool <compnerd@compnerd.org> --- compiler-rt/lib/builtins/cpu_model/x86.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index c21b2bad1d212..45b7055abf454 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -21,7 +21,9 @@ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +#if __STDC_HOSTED__ #include <assert.h> +#endif // __STDC_HOSTED__ #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) #include <cpuid.h> @@ -245,8 +247,8 @@ struct __processor_model { unsigned int __cpu_features[1]; } __cpu_model = {0, 0, 0, {0}}; -static_assert(sizeof(__cpu_model) == 16, - "Wrong size of __cpu_model will result in ABI break"); +_Static_assert(sizeof(__cpu_model) == 16, + "Wrong size of __cpu_model will result in ABI break"); // This code is copied from lib/Support/Host.cpp. // Changes to either file should be mirrored in the other. 
@@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { unsigned Vendor; unsigned Model, Family; unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); - static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); // This function needs to run just once. if (__cpu_model.__cpu_vendor) @@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { } else __cpu_model.__cpu_vendor = VENDOR_OTHER; +#if __STDC_HOSTED__ assert(__cpu_model.__cpu_vendor < VENDOR_MAX); assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); +#endif // __STDC_HOSTED__ return 0; } From 57730f6cdb32654c54e0cd4d535ab7a3b68ef7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 18:37:50 -1000 Subject: [PATCH 121/313] [flang][cuda] Switch to inline ptx for barrier_arrive (#166261) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 13 ++++++------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6ebd52dcd42ea..2fefad1447132 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3386,13 +3386,12 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, assert(args.size() == 2); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); } // BARRIER_INIT (CUDA) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 666c394ad6678..038aa0a058277 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -444,7 +444,7 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! 
CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() From 95d6caa5d43688d60874cd16eb6c06605000f2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 18:40:18 -1000 Subject: [PATCH 122/313] [flang][cuda] Add interfaces and lowering for atomicaddvector (#166275) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 49 +++++++++++++++++++ flang/module/cudadevice.f90 | 16 ++++++ flang/test/Lower/CUDA/cuda-atomicadd.cuf | 19 +++++++ 4 files changed, 86 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-atomicadd.cuf diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 9f15ce68eb3d5..bbdef481a2085 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,8 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicCas(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2fefad1447132..b9ea8b125b780 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,6 +290,14 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, + {"atomicadd_r2x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -3168,6 +3176,47 @@ IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef<int64_t>{0}); } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({2}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({2}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), zero); + mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), one); + mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); + 
mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord);
+ mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy);
+ mlir::Value vec1 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0));
+ mlir::Value vec2 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1));
+ mlir::Value add =
+ genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2);
+ mlir::Value r1 = mlir::LLVM::ExtractElementOp::create(
+ builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0));
+ mlir::Value r2 = mlir::LLVM::ExtractElementOp::create(
+ builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1));
+ mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero);
+ mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one);
+ fir::StoreOp::create(builder, loc, r1, c1);
+ fir::StoreOp::create(builder, loc, r2, c2);
+ mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2);
+ return fir::ArrayBoxValue(res, {ext});
+}
+
 mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType,
 llvm::ArrayRef<mlir::Value> args) {
 assert(args.size() == 2);
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 7a764b589dc56..b1aef95cba8c9 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1178,6 +1178,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val)
 end function
 end interface
+ interface atomicaddvector
+ attributes(device) pure function atomicadd_r2x2(address, val) result(z)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real(2), dimension(2), intent(inout) :: address
+ real(2), dimension(2), intent(in) :: val
+ real(2), dimension(2) :: z
+ end function
+
+ attributes(device) pure function atomicadd_r4x2(address, val) result(z)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real(4), dimension(2), intent(inout) :: address
+ real(4), dimension(2), intent(in) :: val
+ real(4), dimension(2) :: z
+ end function
+ end interface
+
 interface atomicsub
 attributes(device) pure integer function atomicsubi(address, val)
 !dir$ ignore_tkr (d) address, (d) val
diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf
new file mode 100644
index 0000000000000..baa6cdb3d5869
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf
@@ -0,0 +1,19 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran atomicadd functions available in the cudadevice module
+
+attributes(global) subroutine atomicaddvector_r2()
+ real(2), device :: a(2), tmp1(2), tmp2(2)
+ tmp1 = atomicAddVector(a, tmp2)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16>
+
+attributes(global) subroutine atomicaddvector_r4()
+ real(4), device :: a(2), tmp1(2), tmp2(2)
+ tmp1 = atomicAddVector(a, tmp2)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+!
CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> From 993a38fa539d23f83711a0e07d3cc40a0947ec7e Mon Sep 17 00:00:00 2001 From: Lee Wei <lee10202013@gmail.com> Date: Mon, 3 Nov 2025 20:52:51 -0800 Subject: [PATCH 123/313] [MLIR][Affine] Extend getVectorReductionOp to support xor/maxnumf/minnumf (#163310) This PR extends the `getVectorReductionOp` function, which is used by the affine vectorizer, to also recognize and support `xor/maxnumf/minnumf` reduction operations. --- .../Affine/Analysis/AffineAnalysis.cpp | 5 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 10 +- .../Conversion/ConvertToSPIRV/vector.mlir | 36 +++++++ .../SuperVectorize/vectorize_reduction.mlir | 100 ++++++++++++++++++ 4 files changed, 148 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp index 4d2d8738aa4ad..3d1a73417d1ea 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp @@ -66,9 +66,10 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos, .Case([](arith::MaxSIOp) { return arith::AtomicRMWKind::maxs; }) .Case([](arith::MinUIOp) { return arith::AtomicRMWKind::minu; }) .Case([](arith::MaxUIOp) { return arith::AtomicRMWKind::maxu; }) + .Case([](arith::XOrIOp) { return arith::AtomicRMWKind::xori; }) + .Case([](arith::MaxNumFOp) { return arith::AtomicRMWKind::maxnumf; }) + .Case([](arith::MinNumFOp) { return arith::AtomicRMWKind::minnumf; }) .Default([](Operation *) -> std::optional<arith::AtomicRMWKind> { - // TODO: AtomicRMW supports other kinds of reductions this is - // currently not detecting, add those when the need arises. return std::nullopt; }); if (!maybeKind) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ae3423c40040d..daef0ba02100a 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -717,7 +717,15 @@ Value mlir::vector::getVectorReductionOp(arith::AtomicRMWKind op, case arith::AtomicRMWKind::ori: return vector::ReductionOp::create(builder, vector.getLoc(), CombiningKind::OR, vector); - // TODO: Add remaining reduction operations. 
+ case arith::AtomicRMWKind::minnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MINNUMF, vector); + case arith::AtomicRMWKind::maxnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MAXNUMF, vector); + case arith::AtomicRMWKind::xori: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::XOR, vector); default: (void)emitOptionalError(loc, "Reduction operation type not supported"); break; diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir index a75f30d57fa74..cd8cfc8736915 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir @@ -275,6 +275,42 @@ func.func @reduction_minimumf(%v : vector<3xf32>, %s: f32) -> f32 { // ----- +// CHECK-LABEL: spirv.func @reduction_minnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MIN0:.*]] = spirv.GL.FMin %[[S0]], %[[S1]] : f32 +// CHECK: %[[MIN1:.*]] = spirv.GL.FMin %[[MIN0]], %[[S2]] : f32 +// CHECK: %[[MIN2:.*]] = spirv.GL.FMin %[[MIN1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MIN2]] : f32 +// CHECK: } +func.func @reduction_minnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <minnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + +// CHECK-LABEL: spirv.func @reduction_maxnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MAX0:.*]] = spirv.GL.FMax %[[S0]], %[[S1]] : f32 +// CHECK: %[[MAX1:.*]] = spirv.GL.FMax %[[MAX0]], %[[S2]] : f32 +// CHECK: %[[MAX2:.*]] = spirv.GL.FMax %[[MAX1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MAX2]] : f32 +// CHECK: } +func.func @reduction_maxnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <maxnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + // CHECK-LABEL: func @reduction_maxsi // CHECK-SAME: (%[[V:.+]]: vector<3xi32>, %[[S:.+]]: i32) // CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xi32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir index b616632a6fe24..b062736575ad7 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -243,6 +243,106 @@ func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) // CHECK: affine.store %[[final_red]], %{{.*}} : memref<256xi32> // CHECK: } +// ----- + +func.func @vecdim_reduction_xori(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 0 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %xor = arith.xori %red_iter, %ld : i32 + affine.yield %xor : i32 + } + affine.store %final_red, %out[%i] : 
memref<256xi32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_xori( +// CHECK-SAME: %[[input:.*]]: memref<256x512xi32>, +// CHECK-SAME: %[[output:.*]]: memref<256xi32>) { +// CHECK: %[[cst:.*]] = arith.constant 0 : i32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) { +// CHECK: %[[poison:.*]] = ub.poison : i32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[xor:.*]] = arith.xori %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[xor]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_red:.*]] = vector.reduction <xor>, %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xi32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_minnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %min = arith.minnumf %red_iter, %ld : f32 + affine.yield %min : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_minnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0x7FC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[min:.*]] = arith.minnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[min]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <minnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.minnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_maxnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %max = arith.maxnumf %red_iter, %ld : f32 + affine.yield %max : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_maxnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0xFFC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read 
%[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[max:.*]] = arith.maxnumf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[max]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[red_scalar:.*]] = vector.reduction <maxnumf>, %[[vred]] : vector<128xf32> into f32
+// CHECK: %[[final_red:.*]] = arith.maxnumf %[[red_scalar]], %[[cst]] : f32
+// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32>
+// CHECK: }
+// CHECK: return
+// CHECK: }

// -----

From 93e860e694770f52a9eeecda88ba11173c291ef8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 3 Nov 2025 21:09:56 -0800
Subject: [PATCH 124/313] IR: Remove null UseList checks in hasNUses methods
 (#165929)

There do not appear to be any cases where this is used. This does introduce
an odd asymmetry where use_empty is not equivalent to hasNUses(0).

---
 llvm/lib/IR/Value.cpp | 8 -------
 llvm/unittests/IR/ConstantsTest.cpp | 36 +++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index b775cbb0c7920..95d61a987f6c1 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -148,18 +148,10 @@ void Value::destroyValueName() {
 }

 bool Value::hasNUses(unsigned N) const {
- if (!UseList)
- return N == 0;
-
- // TODO: Disallow for ConstantData and remove !UseList check?
 return hasNItems(use_begin(), use_end(), N);
 }

 bool Value::hasNUsesOrMore(unsigned N) const {
- // TODO: Disallow for ConstantData and remove !UseList check?
- if (!UseList)
- return N == 0;
-
 return hasNItemsOrMore(use_begin(), use_end(), N);
 }

diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 6376165cbe766..9cb9e1236b2d1 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -29,13 +29,8 @@ TEST(ConstantsTest, UseCounts) {
 EXPECT_TRUE(Zero->use_empty());
 EXPECT_EQ(Zero->getNumUses(), 0u);
- EXPECT_TRUE(Zero->hasNUses(0));
 EXPECT_FALSE(Zero->hasOneUse());
 EXPECT_FALSE(Zero->hasOneUser());
- EXPECT_FALSE(Zero->hasNUses(1));
- EXPECT_FALSE(Zero->hasNUsesOrMore(1));
- EXPECT_FALSE(Zero->hasNUses(2));
- EXPECT_FALSE(Zero->hasNUsesOrMore(2));

 std::unique_ptr<Module> M(new Module("MyModule", Context));

@@ -50,15 +45,36 @@ TEST(ConstantsTest, UseCounts) {
 // Still looks like use_empty with uses.
EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); - EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); - EXPECT_FALSE(Zero->hasNUses(1)); - EXPECT_FALSE(Zero->hasNUsesOrMore(1)); - EXPECT_FALSE(Zero->hasNUses(2)); - EXPECT_FALSE(Zero->hasNUsesOrMore(2)); } +#ifdef GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + +TEST(ConstantsTest, hasNUsesInvalid) { + LLVMContext Context; + Type *Int32Ty = Type::getInt32Ty(Context); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + std::unique_ptr<Module> M(new Module("MyModule", Context)); + + // Introduce some uses + new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, + GlobalValue::ExternalLinkage, /*Initializer=*/Zero, + "gv_user0"); + new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, + GlobalValue::ExternalLinkage, /*Initializer=*/Zero, + "gv_user1"); + + for (int I = 0; I != 3; ++I) { + EXPECT_DEATH(Zero->hasNUses(I), "hasUseList\\(\\)"); + EXPECT_DEATH(Zero->hasNUsesOrMore(I), "hasUseList\\(\\)"); + } +} + +#endif +#endif + TEST(ConstantsTest, Integer_i1) { LLVMContext Context; IntegerType *Int1 = IntegerType::get(Context, 1); From 0ba7bfc34fa292a34b31f6338464744b5cad58a3 Mon Sep 17 00:00:00 2001 From: Yingying Wang <3171290993@qq.com> Date: Tue, 4 Nov 2025 13:25:49 +0800 Subject: [PATCH 125/313] [DFAJumpThreading] Enable DFAJumpThread by default. (#157646) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We recommend setting `dfa-jump-thread` to be enabled by default. It’s a mature optimization that’s been supported since GCC 9.1.0. At the `-O2` opt level, both the GCC and ICX compilers have this optimization enabled by default. Once it’s enabled, we saw a **13% performance improvement** in the CoreMark benchmark on the X86 platform (Intel i9-11900K Rocket Lake), and even a **15% increase** on the KunMingHu FPGA. Additionally, we verified the correctness of this pass using SPEC 2017. 
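For reference, the shape of code this pass targets is a switch-in-a-loop
state machine. The sketch below is written for this note; the function name,
state values, and accumulator are illustrative assumptions, not taken from
CoreMark or SPEC:

```cpp
// Each case assigns a compile-time-constant next state, so DFA jump
// threading can thread every case directly to its known successor case,
// bypassing the loop-header switch dispatch on the following iteration.
int run_state_machine(const int *input, int n) {
  int state = 0, acc = 0;
  for (int i = 0; i < n; ++i) {
    switch (state) {
    case 0: acc += input[i]; state = 1; break;  // always followed by case 1
    case 1: acc -= input[i]; state = 2; break;  // always followed by case 2
    default: acc ^= input[i]; state = 0; break; // always followed by case 0
    }
  }
  return acc;
}
```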
Co-authored-by: YinZd <104915588+nothiny@users.noreply.github.com> Co-authored-by: ict-ql <168183727+ict-ql@users.noreply.github.com> Co-authored-by: Lin Wang <wanglulin@ict.ac.cn> --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Other/new-pm-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + 8 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..3f41618b18fcf 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 65b96c8b8ef5d..62975a3cf8ac4 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -208,6 +208,7 @@ ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 3a0fffe426da1..012a1ab5802b5 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 4623edcaf6656..e021ff3124b60 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 590afd925e841..20f94bc2e0f6c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running 
pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index dd6acd2c51ee7..b61edc805108d 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index ee054527e20bd..acf8c053d0f82 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index fd95e94f3c8b9..6b3c5ca7605de 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass From 52fdcf94a39a811476654e23176c6ffa69ff7287 Mon Sep 17 00:00:00 2001 From: Piotr Fusik <p.fusik@samsung.com> Date: Tue, 4 Nov 2025 07:55:31 +0100 Subject: [PATCH 126/313] [RISCV][NFC] Match `3/5/9 * 3/5/9 * 2^N` without a loop (#165547) #158851 matches `3/5/9 * 3/5/9` with a `switch`. Reuse it for the shifted case to improve compilation time. 
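As a worked example (the multiplier 200 is chosen purely for illustration and
does not appear in the patch): `countr_zero(200)` gives `Shift = 3`,
`200 >> 3 = 25` hits the `5 * 5` case, and the result is two shXadd-style
operations followed by a single shift. A C++ sketch of the equivalent
arithmetic:

```cpp
#include <cstdint>

// 200 * x decomposed as 5 * 5 * 2^3, mirroring sh2add + sh2add + slli.
uint64_t mul200(uint64_t x) {
  uint64_t m = (x << 2) + x; // sh2add: m = 5 * x
  m = (m << 2) + m;          // sh2add: m = 25 * x
  return m << 3;             // slli:   200 * x
}
```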
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 99 +++++++++------------ 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739f67d9b..c56ce3fd2a5a4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16554,38 +16582,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. 
First check if this a sum of two power of 2s because that's @@ -16648,23 +16646,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } From 546a783d81d31a6a2d83a8b92c88223756a16c8d Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 4 Nov 2025 15:01:18 +0800 Subject: [PATCH 127/313] [Attributor] Fix sanitizer for getAssumedMinTrailingZeros() --- llvm/include/llvm/Transforms/IPO/Attributor.h | 2 ++ llvm/test/lit.cfg.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 8c0342ae5cf12..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5341,6 +5341,8 @@ struct AAPotentialConstantValues /// Return the minimum trailing zeros of potential constants unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; for (const APInt &It : getAssumedSet()) { if (It.countTrailingZeros() < TrailingZeros) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index cadf781b409be..89f1ca6935cff 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -584,7 +584,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 04619db16ba840f1af2ad66f1d3eda9dc086a059 Mon Sep 17 00:00:00 2001 From: quic_hchandel <hchandel@qti.qualcomm.com> Date: Tue, 4 Nov 2025 12:38:50 +0530 Subject: [PATCH 128/313] [RISCV] Add short forward branch support for `mul` instruction (#166300) --- .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 2 + llvm/lib/Target/RISCV/RISCVFeatures.td | 5 + llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4 + llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td | 1 + llvm/test/CodeGen/RISCV/features-info.ll | 1 + .../RISCV/short-forward-branch-opt-mul.ll | 156 ++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a682d86..b0453fc57c053 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case 
RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab22d4ff..5b72334f58d45 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787e0012d..b8ab70bd9e386 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5aaba293..494b1c9f98839 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 988d0490afeb6..cf44af608542c 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -137,6 +137,7 @@ ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). ; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. 
; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, 
a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + From 25a592cc63f0a252b8ae980271d0767a342a0a77 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <srinivasar@nvidia.com> Date: Tue, 4 Nov 2025 12:50:58 +0530 Subject: [PATCH 129/313] [MLIR][NVVM] Update redux.sync op (#166125) This change: - Updates the `redux.sync` NVVM Op input and output type constraints. - Adds a verifier for the Op to prevent stack dumps and hitting `llvm_unreachable` in certain invalid usage scenarios. Instead, we gracefully error out with an informative message now. 
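For illustration, a use that the new verifier accepts could look like the
following (a minimal sketch in the syntax of the tests added below; the
function and value names are hypothetical):

  llvm.func @redux_sync_valid(%value: f32, %mask_and_clamp: i32) {
    // abs/nan and the fmin/fmax kinds are only legal with f32 values.
    %res = nvvm.redux.sync fmax %value, %mask_and_clamp {abs = true, nan = true}: f32 -> f32
    llvm.return
  }

Integer kinds on f32 values, fmin/fmax on i32, abs/nan on i32, and
mismatched result/operand types are now rejected with the diagnostics
exercised in redux-sync-invalid.mlir.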
--- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 8 +-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 37 ++++++++++++++ .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 3 -- .../LLVMIR/nvvm/redux-sync-invalid.mlir | 49 +++++++++++++++++++ 4 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 46fdf5441bc13..9be108d5d1056 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -476,9 +476,9 @@ def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind", def ReduxKindAttr : EnumAttr<NVVM_Dialect, ReduxKind, "redux_kind">; def NVVM_ReduxOp : - NVVM_Op<"redux.sync", [NVVMRequiresSM<80>]>, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_Type:$val, + NVVM_Op<"redux.sync", [NVVMRequiresSM<80>, AllTypesMatch<["res", "val"]>]>, + Results<(outs AnyTypeOf<[I32, F32]>:$res)>, + Arguments<(ins AnyTypeOf<[I32, F32]>:$val, ReduxKindAttr:$kind, I32:$mask_and_clamp, DefaultValuedAttr<BoolAttr, "false">:$abs, @@ -496,6 +496,8 @@ def NVVM_ReduxOp : [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync) }]; + let hasVerifier = 1; + string llvmBuilder = [{ auto intId = getReduxIntrinsicId($_resultType, $kind, $abs, $nan); $res = createIntrinsicCall(builder, intId, {$val, $mask_and_clamp}); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 12c81629d7e76..f2e55f255ceac 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1630,6 +1630,43 @@ LogicalResult NVVM::ClusterLaunchControlQueryCancelOp::verify() { return success(); } +LogicalResult NVVM::ReduxOp::verify() { + mlir::Type reduxType = getType(); + + if (!reduxType.isF32()) { + if (getAbs()) + return emitOpError("abs attribute is supported only for f32 type"); + if (getNan()) + return emitOpError("nan attribute is supported only for f32 type"); + } + + NVVM::ReduxKind kind = getKind(); + switch (kind) { + case NVVM::ReduxKind::ADD: + case NVVM::ReduxKind::AND: + case NVVM::ReduxKind::OR: + case NVVM::ReduxKind::XOR: + case NVVM::ReduxKind::MAX: + case NVVM::ReduxKind::MIN: + case NVVM::ReduxKind::UMAX: + case NVVM::ReduxKind::UMIN: + if (!reduxType.isInteger(32)) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'i32'."; + break; + case NVVM::ReduxKind::FMIN: + case NVVM::ReduxKind::FMAX: + if (!reduxType.isF32()) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'f32'."; + break; + } + + return success(); +} + /// Packs the given `field` into the `result`. /// The `result` is 64-bits and each `field` can be 32-bits or narrower. 
static llvm::Value *
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 3d86b09b32538..0964e1b8c5ef3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -36,9 +36,6 @@ using mlir::LLVM::detail::createIntrinsicCall;
 static llvm::Intrinsic::ID getReduxIntrinsicId(llvm::Type *resultType,
                                                NVVM::ReduxKind kind,
                                                bool hasAbs, bool hasNaN) {
-  if (!(resultType->isIntegerTy(32) || resultType->isFloatTy()))
-    llvm_unreachable("unsupported data type for redux");
-
   switch (kind) {
   case NVVM::ReduxKind::ADD:
     return llvm::Intrinsic::nvvm_redux_sync_add;
diff --git a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
new file mode 100644
index 0000000000000..a8a743006fbf8
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @redux_sync_i32_with_abs(%value: i32, %offset: i32) {
+  // expected-error@+1 {{abs attribute is supported only for f32 type}}
+  %res = nvvm.redux.sync add %value, %offset {abs = true}: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_i32_with_nan(%value: i32, %offset: i32) {
+  // expected-error@+1 {{nan attribute is supported only for f32 type}}
+  %res = nvvm.redux.sync add %value, %offset {nan = true}: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) {
+  // expected-error@+1 {{'add' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+  %res = nvvm.redux.sync add %value, %offset: f32 -> f32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) {
+  // expected-error@+1 {{'and' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+  %res = nvvm.redux.sync and %value, %offset: f32 -> f32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_i32_with_invalid_kind_fmin(%value: i32, %offset: i32) {
+  // expected-error@+1 {{'fmin' redux kind unsupported with 'i32' type. Only supported type is 'f32'.}}
+  %res = nvvm.redux.sync fmin %value, %offset: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_non_matching_types(%value: i32, %offset: i32) {
+  // expected-error@+1 {{failed to verify that all of {res, val} have same type}}
+  %res = nvvm.redux.sync add %value, %offset: i32 -> f32
+  llvm.return
+}
From 6ad25c5912fcf13b44fcc03bd6a66dc33348cd68 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 4 Nov 2025 07:50:51 +0000
Subject: [PATCH 130/313] [AArch64] Improve the cost model for extending mull (#125651)

We already have cost model code for detecting extending mull multiplies
of the form `mul(ext, ext)`. Since it was added, the codegen for mull has
improved; this patch attempts to catch the cost model up.

The main idea is to incorporate extends of larger sizes. A vector
`v8i32 mul(zext(v8i8), zext(v8i8))` will be code-generated as
`zext(v8i16 mul(zext(v8i8), zext(v8i8)))`, i.e. umull+ushll+ushll2, so
the total cost should be about 3 if each instruction costs 1. Where
exactly we attribute the costs is debatable; this patch opts to set the
cost of the extend to 0 (or to the cost of whatever part of the extend
is not folded into the mull), while the mul gets the cost of the mull
plus the extra extends.
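As a worked example (an illustrative IR sketch, not one of the updated
tests; the function name is hypothetical), the costs now break down as:

  define <8 x i32> @umull_sketch(<8 x i8> %a, <8 x i8> %b) {
    %za = zext <8 x i8> %a to <8 x i32>   ; cost 0: absorbed by the umull
    %zb = zext <8 x i8> %b to <8 x i32>   ; cost 0: absorbed by the umull
    %m = mul <8 x i32> %za, %zb           ; cost ~3: umull + ushll + ushll2
    ret <8 x i32> %m
  }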
isWideningInstruction is split into two functions for the two types of
operands it supports: isSingleExtWideningInstruction now handles addw
instructions that extend the second operand, while
isBinExtWideningInstruction handles instructions like addl that extend
both operands.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 158 ++++++--
 .../AArch64/AArch64TargetTransformInfo.h      |  14 +-
 .../CostModel/AArch64/arith-widening.ll       | 252 ++++++-------
 .../AArch64/fully-unrolled-cost.ll            |   2 +-
 .../partial-reduce-dot-product-mixed.ll       | 116 +++---
 .../partial-reduce-dot-product-neon.ll        |  16 +-
 .../AArch64/partial-reduce-dot-product.ll     | 351 +++++++-----------
 .../SLPVectorizer/AArch64/vecreduceadd.ll     |   4 +-
 8 files changed, 460 insertions(+), 453 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5565afd62b1..10f2c80edc1b3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   llvm_unreachable("Unsupported register kind");
 }
 
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
-                                           ArrayRef<const Value *> Args,
-                                           Type *SrcOverrideTy) const {
+bool AArch64TTIImpl::isSingleExtWideningInstruction(
+    unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
+    Type *SrcOverrideTy) const {
   // A helper that returns a vector type from the given type. The number of
   // elements in type Ty determines the vector width.
   auto toVectorTy = [&](Type *ArgTy) {
@@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
     return false;
 
-  // Determine if the operation has a widening variant. We consider both the
-  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
-  // instructions.
-  //
-  // TODO: Add additional widening operations (e.g., shl, etc.) once we
-  // verify that their extending operands are eliminated during code
-  // generation.
   Type *SrcTy = SrcOverrideTy;
   switch (Opcode) {
-  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
-  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+  case Instruction::Add: // UADDW(2), SADDW(2).
+  case Instruction::Sub: { // USUBW(2), SSUBW(2).
     // The second operand needs to be an extend
     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
       if (!SrcTy)
         SrcTy =
             toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
-    } else
+      break;
+    }
+
+    if (Opcode == Instruction::Sub)
       return false;
-    break;
-  case Instruction::Mul: { // SMULL(2), UMULL(2)
-    // Both operands need to be extends of the same type.
-    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
-        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+
+    // UADDW(2), SADDW(2) can be commuted.
+    if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
       if (!SrcTy)
         SrcTy =
             toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
-    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
-      // If one of the operands is a Zext and the other has enough zero bits to
-      // be treated as unsigned, we can still general a umull, meaning the zext
-      // is free.
-      KnownBits Known =
-          computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a9a381d..e3b0a1bec53ec 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. // diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost 
Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: 
Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: 
%asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: 
%zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; 
CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 
= zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found 
costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of 
RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
@@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
@@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64>
@@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32>
@@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64>
@@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
@@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 199203a9f5cb0..1164778c19070 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 16
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 26e630f969ef3..0ee6b52a2450b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: entry:
; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-NOI8MM: vector.ph:
-; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
-; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
-; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
-; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]]
+; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-NOI8MM: scalar.ph:
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-NOI8MM: for.exit:
+; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]]
;
entry:
br label %for.body
@@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: entry:
; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-NOI8MM: vector.ph:
-; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
-; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
-; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
-; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]]
+; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-NOI8MM: scalar.ph:
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-NOI8MM: for.exit:
+; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]]
;
entry:
br label %for.body
@@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index b84763142b686..e74830700776c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 4636c1b63da82..d77ca9875bf01 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]]
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]]
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]]
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
-; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]]
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i32 [[TMP8]]
;
entry:
br label %for.body
@@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8
@@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
@@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]]
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
@@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]])
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]])
@@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]]
@@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]]
@@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]]
@@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]]
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
@@ -1651,7 +1590,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
@@ -1685,7 +1624,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1803,7 +1742,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -1915,7 +1854,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+;
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1892,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1907,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] 
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1983,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2021,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2036,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: 
[[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2117,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2163,7 @@ define dso_local i32 
@not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2205,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2327,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2427,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) 
@@ -2596,7 +2527,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: From f74e90961f51c9437461007c89b037be41e4e887 Mon Sep 17 00:00:00 2001 From: Maya Amrami <62667278+amrami@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:36:11 +0200 Subject: [PATCH 131/313] [mlir][memref]: Collapse strided unit dim even if strides are dynamic (#157330) --- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 10 +++++----- mlir/test/Dialect/MemRef/ops.mlir | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 1c21a2f270da6..e271ac58db327 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2568,6 +2568,11 @@ computeCollapsedLayoutMap(MemRefType srcType, auto trailingReassocs = ArrayRef<int64_t>(reassoc).drop_front(); auto stride = SaturatedInteger::wrap(resultStrides[resultStrideIndex--]); for (int64_t idx : llvm::reverse(trailingReassocs)) { + // Dimensions of size 1 should be skipped, because their strides are + // meaningless and could have any arbitrary value. + if (srcShape[idx - 1] == 1) + continue; + stride = stride * SaturatedInteger::wrap(srcShape[idx]); // Both source and result stride must have the same static value. In that @@ -2582,11 +2587,6 @@ computeCollapsedLayoutMap(MemRefType srcType, if (strict && (stride.saturated || srcStride.saturated)) return failure(); - // Dimensions of size 1 should be skipped, because their strides are - // meaningless and could have any arbitrary value. 
-    if (srcShape[idx - 1] == 1)
-      continue;
-
     if (!stride.saturated && !srcStride.saturated && stride != srcStride)
       return failure();
   }
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index a90c9505a8405..b1db99bb3ad08 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -440,7 +440,8 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
                                          %arg4: index,
                                          %arg5: index,
                                          %arg6: index,
-                                         %arg7: memref<4x?x4xf32>) {
+                                         %arg7: memref<4x?x4xf32>,
+                                         %arg8: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>) {
 // CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
 // CHECK-SAME: memref<?x?x?xf32> into memref<?x?xf32>
   %0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
@@ -489,6 +490,10 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
 // CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
   %4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2, %arg4, 2, 2] : memref<4x?x4xf32> into memref<2x2x?x2x2xf32>
+
+// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3]]
+// CHECK-SAME: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>>
+  %5 = memref.collapse_shape %arg8 [[0, 1], [2], [3]] : memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>>
   return
 }

From c02bdd466a1c22221bc6de3b6817945c90979351 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Tue, 4 Nov 2025 10:22:13 +0100
Subject: [PATCH 132/313] [AMDGPU] Fix handling of FP in cs.chain functions
 (#161194)

In case there is a dynamic alloca or an alloca that is not in the entry
block, cs.chain functions do not set up an FP but are reported to need
one. This results in a failed assertion in
`SIFrameLowering::emitPrologue()` (Assertion `(!HasFP || FPSaved) &&
"Needed to save FP but didn't save it anywhere"' failed.)

This commit changes `hasFPImpl` so that the need for an SP in a
cs.chain function no longer directly implies the need for an FP.

This LLVM defect was identified via the AMD Fuzzing project.
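For illustration, here is a minimal IR sketch of the failure mode, along the lines of the `test_alloca` case added below (function and label names are illustrative): the alloca outside the entry block makes the function need a stack pointer, which `hasFPImpl` previously took to imply a frame pointer as well.

```llvm
; Sketch: an amdgpu_cs_chain function whose only alloca sits outside the
; entry block. An SP is required, but no FP must be reported, because
; cs.chain functions never save one.
define amdgpu_cs_chain void @nonentry_alloca() {
entry:
  br label %body

body:                                   ; alloca is not in the entry block
  %v = alloca i32, align 4, addrspace(5)
  store i32 0, ptr addrspace(5) %v, align 4
  ret void
}
```

The `test_alloca*` functions in the new test file below exercise exactly this shape, plus variants with calls and dynamically sized allocas.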
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 4 +- .../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 519 ++++++++++++++++++ 2 files changed, 522 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 0000000000000..06150e4277e9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or that they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; 
+; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 
+; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 
s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: 
test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} From bb4ed55acdbc7f48bc978147189e8106e3ea42f8 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 09:25:28 +0000 Subject: [PATCH 133/313] [lldb][Runtime] Move VerboseTrapFrameRecognizer into CPPLanguageRuntime (#166157) https://github.com/llvm/llvm-project/pull/165996 is adding a Clang dependency to Target because we're moving some of the functionality of the VerboseTrapFrameRecognizer into libClang. To avoid adding this dependency this patch moves VerboseTrapFrameRecognizer into the CPPLanguageRuntime. Most of the frame recognizers already live in the various runtime plugins. An alternative discussed was to create a common `CLanguageRuntime` whose currently sole responsibility was to register the `VerboseTrapFrameRecognizer` and `AssertStackFrameRecognizer`. The main issue I ran into here was frame recognizers aren't uniqued in the target. Currently this only manifests when re-running a target, which re-triggers all the recognizer registration (added a test with a FIXME for this). 
If we had a common `CLanguageRuntime` that `CPPLanguageRuntime` and
`ObjCLanguageRuntime` inherited from, I didn't find a great way to avoid
registering the recognizer multiple times. We can't just call_once on it
because we do want the recognizers to be re-registered for new targets
in the same debugger session. If the recognizers were stored in
something like a UniqueVector in the Target, then we wouldn't have that
issue. But currently that's not the case, and it would take a bit of
refactoring to de-dupe the recognizers.

There may very well be solutions I haven't considered, but I wasn't
very happy with any of the things I've tried so far. So in the end I
just moved this to the C++ runtime for now in order to unblock
https://github.com/llvm/llvm-project/pull/165996.

The C++ language runtime is always available (even for C targets) if
the C++ language plugin is available, which it should be unless someone
is using an LLDB with the C++ plugin compiled out. But at that point
numerous things wouldn't work even when debugging just C.
---
 .../LanguageRuntime/CPlusPlus/CMakeLists.txt  |  1 +
 .../CPlusPlus/CPPLanguageRuntime.cpp          |  6 ++-
 .../CPlusPlus}/VerboseTrapFrameRecognizer.cpp |  2 +-
 .../CPlusPlus}/VerboseTrapFrameRecognizer.h   | 14 +++--
 lldb/source/Target/CMakeLists.txt             |  1 -
 lldb/source/Target/Process.cpp                |  2 -
 .../Shell/Recognizer/Inputs/verbose_trap.m    |  4 ++
 .../Shell/Recognizer/registration-unique.test | 54 +++++++++++++++++++
 .../Shell/Recognizer/verbose_trap-objc.test   | 12 +++++
 9 files changed, 88 insertions(+), 8 deletions(-)
 rename lldb/source/{Target => Plugins/LanguageRuntime/CPlusPlus}/VerboseTrapFrameRecognizer.cpp (98%)
 rename lldb/{include/lldb/Target => source/Plugins/LanguageRuntime/CPlusPlus}/VerboseTrapFrameRecognizer.h (63%)
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
 create mode 100644 lldb/test/Shell/Recognizer/registration-unique.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-objc.test

diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
index 1717b0a896669..a27bceffe2e3a 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_lldb_library(lldbPluginCPPRuntime
   CPPLanguageRuntime.cpp
+  VerboseTrapFrameRecognizer.cpp

   LINK_LIBS
     lldbCore
diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
index 21a5ebe53073a..913678b629f2f 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
@@ -12,6 +12,7 @@
 #include <memory>

 #include "CPPLanguageRuntime.h"
+#include "VerboseTrapFrameRecognizer.h"

 #include "llvm/ADT/StringRef.h"

@@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer {

 CPPLanguageRuntime::CPPLanguageRuntime(Process *process)
     : LanguageRuntime(process) {
-  if (process)
+  if (process) {
     process->GetTarget().GetFrameRecognizerManager().AddRecognizer(
         StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {},
         std::make_shared<RegularExpression>("^std::__[^:]*::"),
         /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments,
         /*first_instruction_only=*/false);
+
+    RegisterVerboseTrapFrameRecognizer(*process);
+  }
 }

 bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) {
diff --git
a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 98% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..730aba5b42a3e 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..8e6d51efad1f3 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -80,7 +80,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..42ce198a283da 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. 
RegisterAssertFrameRecognizer(this);
-  RegisterVerboseTrapFrameRecognizer(*this);
 }

 Process::~Process() {
diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
new file mode 100644
index 0000000000000..83a829a8c2fdd
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
@@ -0,0 +1,4 @@
+int main() {
+  __builtin_verbose_trap("Foo", "Bar");
+  return 0;
+}
diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test
new file mode 100644
index 0000000000000..f6052bb57c940
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/registration-unique.test
@@ -0,0 +1,54 @@
+# Checks that the recognizers that should work across language runtimes
+# are only registered once with the target.
+
+# RUN: split-file %s %t
+
+# RUN: %clang_host %t/main.cpp -g -o %t/cpp.out
+# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s
+
+# RUN: %clang_host -x objective-c++ %t/main.mm -g -o %t/objcxx.out
+# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s
+
+# RUN: %clang_host %t/main.c -g -o %t/c.out
+# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s
+
+# RUN: %clang_host -x objective-c %t/main.m -g -o %t/objc.out
+# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s
+
+#--- main.m
+int main() {}
+
+#--- main.c
+int main() {}
+
+#--- main.mm
+int main() {}
+
+#--- main.cpp
+int main() {}
+
+#--- commands.input
+
+b main
+frame recognizer list
+run
+frame recognizer list
+continue
+run
+frame recognizer list
+
+# CHECK: frame recognizer list
+# CHECK-NEXT: no matching results found.
+
+# CHECK: frame recognizer list
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
+# CHECK-NOT: Verbose Trap StackFrame Recognizer
+# CHECK-NOT: Assert StackFrame Recognizer
+
+# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341
+# CHECK: frame recognizer list
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test
new file mode 100644
index 0000000000000..789e79c9542c5
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+#
+# RUN: %clangxx_host -x objective-c -g %S/Inputs/verbose_trap.m -o %t.out
+# RUN: %lldb -b -s %s %t.out | FileCheck %s
+
+run
+# CHECK: thread #{{.*}}stop reason = Foo: Bar
+frame info
+# CHECK: frame #{{.*}}`main at verbose_trap.m
+frame recognizer info 0
+# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer
+q

From 9ee9fb0d4090ffb3799ddb731394e403dbda2fb3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 4 Nov 2025 09:37:05 +0000
Subject: [PATCH 134/313] [X86] narrowBitOpRMW - add handling for single bit
 insertion patterns (REAPPLIED) (#166337)

Insertion of a single bit into a large integer is typically
canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can
be simplified to modify the i32 block as a BTR followed by an
OR((i32)InsertBit << (ShAmt % 32)).

We must ensure that the InsertBit is zero apart from the LSB so we can
cheaply truncate it to work with the i32 block like the simpler BT
patterns.
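For reference, the IR shape this fold targets looks roughly like the following hand-written sketch (not code from the patch; the `init_eq_i128` tests updated below exercise the same pattern, and the `and` that bounds the shift amount stands in for whatever lets KnownBits prove the shift is in range):

```llvm
; RMW single-bit insertion on an illegal i128 type:
;   (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
; InsertBit is zero except possibly its LSB and the shift amount is known
; to be in bounds, so the fold may rewrite the RMW to touch only the i32
; block containing the bit.
define void @insert_bit_i128(ptr %word, i32 %position, i1 %bit) {
  %pos = and i32 %position, 127            ; keep the shift amount in bounds
  %amt = zext i32 %pos to i128
  %old = load i128, ptr %word
  %mask = shl i128 1, %amt                 ; 1 << ShAmt
  %notmask = xor i128 %mask, -1            ; ~(1 << ShAmt)
  %clear = and i128 %old, %notmask         ; X & ~(1 << ShAmt)
  %bitext = zext i1 %bit to i128
  %ins = shl i128 %bitext, %amt            ; InsertBit << ShAmt
  %new = or i128 %clear, %ins
  store i128 %new, ptr %word
  ret void
}
```

After the fold, only the 4-byte block containing bit `%pos` is loaded and stored, rather than the full 16-byte value.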
REAPPLIED from #165742 which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by #166160 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 952 ++----------------- 2 files changed, 114 insertions(+), 881 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b97b5089cb0a3..d4a4d4339f7e1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53349,7 +53349,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -53375,14 +53376,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue InsertBit, ShAmt; if (!StoredVal.hasOneUse() || !(sd_match(StoredVal, m_And(m_Specific(LoadVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Or(m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53390,6 +53397,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53397,6 +53411,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. 
@@ -53411,13 +53426,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), Align(), St->getMemOperand()->getFlags()); } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index c311ab869c311..87a54a0b9148d 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %edx, (%ebx) -; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 64(%esp,%eax), %edx -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl 76(%esp,%esi), %esi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl 36(%esp,%ecx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esp,%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl 8(%eax), %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: notl %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 44(%esp,%eax), %eax -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl (%eax), %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 4(%ecx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ecx,%eax), %eax -; X86-NEXT: btl %esi, %eax -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
%ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; 
AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -977,673 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) 
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 
20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; 
SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 136(%rsp,%r12), %r9 -; SSE-NEXT: movq 144(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 152(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %rax, %r11 -; SSE-NEXT: movq 120(%rsp,%r12), %r10 -; SSE-NEXT: movq 128(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r10, %rbx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 104(%rsp,%r12), %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r15 -; SSE-NEXT: shldq %cl, %r14, %r15 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 96(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r13 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: shldq %cl, %rax, %r14 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: movq %rsi, %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rbx -; SSE-NEXT: notq %r11 -; SSE-NEXT: movq 24(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq -8(%rsp,%r12), %rbp -; SSE-NEXT: movq (%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rbp, %rsi -; SSE-NEXT: andq 56(%rdi), %r11 -; SSE-NEXT: andq 32(%rdi), %rbx -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %rsi, %rbx -; SSE-NEXT: notq %r15 -; SSE-NEXT: shldq %cl, %rdx, %r8 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 40(%rdi), %r9 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: movq -24(%rsp,%r12), %rax -; SSE-NEXT: movq -16(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rax, %rsi -; SSE-NEXT: andq 16(%rdi), %r15 -; SSE-NEXT: orq %rsi, %r15 -; SSE-NEXT: shldq %cl, %rdx, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r13 -; SSE-NEXT: movq -32(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: andq 24(%rdi), %r10 -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rbp, %r10 -; SSE-NEXT: orq %rsi, %r13 -; SSE-NEXT: notq %r14 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: andq 
8(%rdi), %r14 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %r11, 56(%rdi) -; SSE-NEXT: movq %rbx, 32(%rdi) -; SSE-NEXT: movq %r9, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r10, 24(%rdi) -; SSE-NEXT: movq %r13, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: addq $168, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $184, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %ebx -; AVX2-NEXT: shrl $3, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: andl $56, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r11 -; AVX2-NEXT: movq 128(%rsp,%r11), %r15 -; AVX2-NEXT: movq 136(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r11), %r8 -; AVX2-NEXT: shldq %cl, %r8, %r15 -; AVX2-NEXT: movq 144(%rsp,%r11), %r14 -; AVX2-NEXT: movq 152(%rsp,%r11), %rsi -; AVX2-NEXT: movq %rsi, %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 112(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 160(%rsp,%r11), %r13 -; AVX2-NEXT: movq 168(%rsp,%r11), %r12 -; AVX2-NEXT: shldq %cl, %r13, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r13 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 24(%rsp,%r11), %rbp -; AVX2-NEXT: movq 32(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq 40(%rsp,%r11), %r10 -; AVX2-NEXT: shldq %cl, %rdx, %r10 -; AVX2-NEXT: movq 8(%rsp,%r11), %r9 -; AVX2-NEXT: movq 16(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 
-8(%rsp,%r11), %rax -; AVX2-NEXT: movq (%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rsi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 32(%rdi), %r14, %r14 -; AVX2-NEXT: orq %r10, %r12 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: movq -16(%rsp,%r11), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %r11 -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %rax -; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %r10, %r10 -; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: andnq (%rdi), %r8, %rsi -; AVX2-NEXT: orq %r11, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: andl $60, %ebx -; AVX2-NEXT: movl (%rdi,%rbx), %eax -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %eax -; AVX2-NEXT: movq %r13, 48(%rdi) -; AVX2-NEXT: movq %r12, 56(%rdi) -; AVX2-NEXT: movq %r14, 32(%rdi) -; AVX2-NEXT: movq %rdx, 40(%rdi) -; AVX2-NEXT: movq %rcx, 16(%rdi) -; AVX2-NEXT: movq %r10, 24(%rdi) -; AVX2-NEXT: movq %rsi, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $184, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $168, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r10d -; AVX512-NEXT: shrl $3, %r10d -; AVX512-NEXT: movl %r10d, %r8d -; AVX512-NEXT: andl $56, %r8d -; AVX512-NEXT: negl %r8d -; AVX512-NEXT: movslq %r8d, %r9 -; AVX512-NEXT: movq 112(%rsp,%r9), %r11 -; AVX512-NEXT: movq 120(%rsp,%r9), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r11, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 128(%rsp,%r9), %r15 -; AVX512-NEXT: movq 136(%rsp,%r9), %rbp -; AVX512-NEXT: movq %rbp, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: shldq %cl, %r14, %r15 -; AVX512-NEXT: movq 144(%rsp,%r9), %r13 -; AVX512-NEXT: movq 152(%rsp,%r9), %r12 -; AVX512-NEXT: shldq %cl, %r13, %r12 -; AVX512-NEXT: movq 96(%rsp,%r9), %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 8(%rsp,%r9), %r8 -; AVX512-NEXT: movq 16(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbp -; AVX512-NEXT: shldq %cl, %r8, %rbp -; AVX512-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX512-NEXT: orq %rbp, %r13 -; AVX512-NEXT: movq 24(%rsp,%r9), %rbp -; AVX512-NEXT: shldq %cl, %rax, %rbp -; AVX512-NEXT: movq -8(%rsp,%r9), %rax -; AVX512-NEXT: movq (%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 32(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: movq -24(%rsp,%r9), %rdx -; AVX512-NEXT: movq -16(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rbp -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: andnq 16(%rdi), %r11, %r8 -; AVX512-NEXT: orq %rbp, %r8 -; AVX512-NEXT: shlxq %rcx, %r14, %r11 -; AVX512-NEXT: movq -32(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r9, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: andnq (%rdi), %r11, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andnq 8(%rdi), %rax, %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: andl $60, %r10d -; AVX512-NEXT: movl (%rdi,%r10), %edx -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX512-NEXT: btl %r9d, %edx -; AVX512-NEXT: movq %r13, 48(%rdi) -; AVX512-NEXT: movq %r12, 56(%rdi) -; AVX512-NEXT: movq %r15, 32(%rdi) -; AVX512-NEXT: movq %rbx, 40(%rdi) -; AVX512-NEXT: movq %r8, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %rcx, (%rdi) -; AVX512-NEXT: movq %rax, 8(%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $168, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs From 84cc2b0ebc2b28e0474c9aa46264f4c20adcc959 Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada <kikairoya@gmail.com> Date: Tue, 4 Nov 2025 18:37:58 +0900 Subject: [PATCH 135/313] [LIT] replace `lit.util.mkdir` with `pathlib.Path.mkdir` (#163948) `lit.util.mkdir` and `lit.util.mkdir_p` were written during the Python 2.x era. Since modern `pathlib` functions have similar functionality, we can simply use those instead. If you encounter a path length issue after this change, the registry value `LongPathsEnabled` must be set as described in https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation . 
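For reference, the replacement idiom is just the standard library (a minimal sketch; the path below is made up):

```
import pathlib

out = pathlib.Path("build") / "test-output"

# Equivalent of the old lit.util.mkdir_p(out): creates missing parents and
# tolerates an already-existing directory.
out.mkdir(parents=True, exist_ok=True)

# Equivalent of the old lit.util.mkdir(out): a single level, where
# exist_ok=True lets the OS decide whether the target already exists
# instead of lit pre-checking with os.path.exists() (see the Cygwin
# background below).
out.mkdir(exist_ok=True)
```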
Note that the Python runtime is already marked as a `longPathAware` executable.

**Background:**

On Cygwin, a file named `file_name.exe` can be accessed without the suffix, simply as `file_name`, as shown below:

```
$ echo > file_name.exe
$ file file_name.exe
file_name.exe: very short file (no magic)
$ file file_name
file_name: very short file (no magic)
```

In this situation, while running `mkdir file_name` works as intended, checking for the existence of the target before calling `mkdir` incorrectly reports that it already exists and thus skips the directory creation.

```
$ test -e file_name && echo exists
exists
$ mkdir file_name && echo ok
ok
$ file file_name
file_name: directory
```

Therefore, the existence pre-check should be skipped on Cygwin. Instead of adding a workaround, I refactored them.
---
 llvm/utils/lit/lit/TestRunner.py              | 15 ++++---
 llvm/utils/lit/lit/util.py                    | 40 +++----------------
 .../shtest-glob/example_dir1.input/empty      |  0
 .../shtest-glob/example_dir2.input/empty      |  0
 .../tests/Inputs/shtest-glob/glob-mkdir.txt   |  5 +++
 llvm/utils/lit/tests/shtest-glob.py           |  9 +++--
 6 files changed, 22 insertions(+), 47 deletions(-)
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 3176b1a257434..64148c6098327 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -462,16 +462,15 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
     stderr = StringIO()
     exitCode = 0
     for dir in args:
-        cwd = cmd_shenv.cwd
-        dir = to_unicode(dir) if kIsWindows else to_bytes(dir)
-        cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
-        if not os.path.isabs(dir):
-            dir = lit.util.abs_path_preserve_drive(os.path.join(cwd, dir))
+        dir = pathlib.Path(dir)
+        cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
+        if not dir.is_absolute():
+            dir = lit.util.abs_path_preserve_drive(cwd / dir)
         if parent:
-            lit.util.mkdir_p(dir)
+            dir.mkdir(parents=True, exist_ok=True)
         else:
             try:
-                lit.util.mkdir(dir)
+                dir.mkdir(exist_ok=True)
             except OSError as err:
                 stderr.write("Error: 'mkdir' command failed, %s\n" % str(err))
                 exitCode = 1
@@ -2411,7 +2410,7 @@ def runOnce(
         return out, err, exitCode, timeoutInfo, status

     # Create the output directory if it does not already exist.
-    lit.util.mkdir_p(os.path.dirname(tmpBase))
+    pathlib.Path(tmpBase).parent.mkdir(parents=True, exist_ok=True)

     # Re-run failed tests up to test.allowed_retries times.
     execdir = os.path.dirname(test.getExecPath())
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index ce4c3c2df3436..e4e031b3e0898 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -5,6 +5,7 @@
 import math
 import numbers
 import os
+import pathlib
 import platform
 import re
 import signal
@@ -131,48 +132,17 @@ def abs_path_preserve_drive(path):
         # Since Python 3.8, os.path.realpath resolves sustitute drives,
         # so we should not use it. In Python 3.7, os.path.realpath
         # was implemented as os.path.abspath.
+        if isinstance(path, pathlib.Path):
+            return path.absolute()
         return os.path.abspath(path)
     else:
         # On UNIX, the current directory always has symbolic links resolved,
         # so any program accepting relative paths cannot preserve symbolic
         # links in paths and we should always use os.path.realpath.
+ if isinstance(path, pathlib.Path): + return path.resolve() return os.path.realpath(path) -def mkdir(path): - try: - if platform.system() == "Windows": - from ctypes import windll - from ctypes import GetLastError, WinError - - path = os.path.abspath(path) - # Make sure that the path uses backslashes here, in case - # python would have happened to use forward slashes, as the - # NT path format only supports backslashes. - path = path.replace("/", "\\") - NTPath = to_unicode(r"\\?\%s" % path) - if not windll.kernel32.CreateDirectoryW(NTPath, None): - raise WinError(GetLastError()) - else: - os.mkdir(path) - except OSError: - e = sys.exc_info()[1] - # ignore EEXIST, which may occur during a race condition - if e.errno != errno.EEXIST: - raise - - -def mkdir_p(path): - """mkdir_p(path) - Make the "path" directory, if it does not exist; this - will also make directories for any missing parent directories.""" - if not path or os.path.exists(path): - return - - parent = os.path.dirname(path) - if parent != path: - mkdir_p(parent) - - mkdir(path) - def listdir_files(dirname, suffixes=None, exclude_filenames=None, prefixes=None): """Yields files in a directory. diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt index d1329f5dbfaae..71972411bc4cf 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt @@ -1,2 +1,7 @@ ## Tests glob pattern handling in the mkdir command. + +## This mkdir should fail because the `example_file*.input`s are regular files. # RUN: not mkdir %S/example_file*.input + +## This mkdir should succeed (so RUN should fail) because the `example_dir*.input`s that already exist are directories. +# RUN: not mkdir %S/example_dir*.input diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py index ae90f31907d49..aa4705b634a7d 100644 --- a/llvm/utils/lit/tests/shtest-glob.py +++ b/llvm/utils/lit/tests/shtest-glob.py @@ -1,12 +1,13 @@ ## Tests glob pattern handling in echo command. # RUN: not %{lit} -a -v %{inputs}/shtest-glob \ -# RUN: | FileCheck -dump-input=fail -match-full-lines %s -# +# RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s # END. # CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}}) # CHECK: TypeError: string argument expected, got 'GlobItem' -# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) -# CHECK: # error: command failed with exit status: 1 +# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) +# CHECK: # | Error: 'mkdir' command failed, {{.*}}example_file1.input' +# CHECK-NEXT: # | Error: 'mkdir' command failed, {{.*}}example_file2.input' +# CHECK: # error: command failed with exit status: 1 From dec6e7380c691af755efe3f80e53590b743c497c Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada <kikairoya@gmail.com> Date: Tue, 4 Nov 2025 18:38:20 +0900 Subject: [PATCH 136/313] [LIT][Cygwin] Mark `ulimit -f` unsupported for Cygwin (#165849) Cygwin doesn't support `ulimit -f` because Windows doesn't provide such functionality. 
---
 llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
index 286fd3d7e173e..d81cde0159792 100644
--- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
+++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
@@ -2,7 +2,7 @@

 # ulimit does not work on non-POSIX platforms.
 # These tests are specific to options that Darwin does not support.
-# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris
+# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris

 # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s

From 73e3d788906a6e82394ac41d5c9baf918740bb0c Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 4 Nov 2025 10:00:33 +0000
Subject: [PATCH 137/313] [flang][test] Fix gcc-triple test when default
 target is not x86_64

By adding --target to the first command.

Test added by #165886.
---
 flang/test/Driver/gcc-triple.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90
index 027d78a7c5046..324311febf157 100644
--- a/flang/test/Driver/gcc-triple.f90
+++ b/flang/test/Driver/gcc-triple.f90
@@ -2,7 +2,7 @@

 !! Test that --gcc-triple option is working as expected.

-! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE
+! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE
 ! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation:
 ! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13
 ! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation:

From 0cae0af520f7f3c60367ea4a2b38a32a35fa6c27 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Tue, 4 Nov 2025 10:04:57 +0000
Subject: [PATCH 138/313] [VPlan] Shorten insert-idiom in sinkScalarOperands
 (NFC) (#166343)

To follow up on a post-commit review.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14530539..e8e87aa8a4f3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) {
   bool ScalarVFOnly = Plan.hasScalarVFOnly();
   bool Changed = false;

-  auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
-                                             VPSingleDefRecipe *Candidate) {
-    // We only know how to duplicate VPReplicateRecipes and
-    // VPScalarIVStepsRecipes for now.
+  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
+                                        VPBasicBlock *SinkTo, VPValue *Op) {
+    auto *Candidate =
+        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
+    if (!Candidate)
+      return;
+
+    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
+    // for now.
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || Candidate->mayReadOrWriteMemory()) - return false; + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; From ab487b6378cfd2f59b25eb945f07c5074b5217b8 Mon Sep 17 00:00:00 2001 From: Andrew Ng <andrew.ng@sony.com> Date: Tue, 4 Nov 2025 10:07:54 +0000 Subject: [PATCH 139/313] [BitcodeReader][NFC] Tidy getEnableSplitLTOUnitAndUnifiedFlag (#165732) --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 34 ++++++----------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..1bdf3e9f684f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8566,16 +8566,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo <OInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8581,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. 
@@ -8607,9 +8604,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8633,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } From 97947f150f55727c5ae0998218260561e28b946e Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <srinivasar@nvidia.com> Date: Tue, 4 Nov 2025 15:38:59 +0530 Subject: [PATCH 140/313] [MLIR][NVVM] Update Op verifiers to prevent ungraceful exits (#165677) Updates the following Ops to prevent ungraceful exits with a stack-dump in certain cases of incorrect usages, and instead gracefully error out with a more informative error message: - `tcgen05.ld` - `shfl.sync` --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 9 +++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 46 +++++++++++++++---- mlir/test/Dialect/LLVMIR/invalid.mlir | 6 +-- .../Target/LLVMIR/nvvm/shfl-sync-invalid.mlir | 22 +++++++++ .../LLVMIR/nvvm/tcgen05-ld-invalid.mlir | 9 ++++ 5 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 9be108d5d1056..5f87e5c07e56e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1328,9 +1328,9 @@ def ShflKindAttr : EnumAttr<NVVM_Dialect, ShflKind, "shfl_kind">; def NVVM_ShflOp : NVVM_Op<"shfl.sync", [NVVMRequiresSM<30>]>, - Results<(outs LLVM_Type:$res)>, + Results<(outs AnyTypeOf<[I32, F32, LLVMStructType]>:$res)>, Arguments<(ins I32:$thread_mask, - LLVM_Type:$val, + AnyTypeOf<[I32, F32]>:$val, I32:$offset, I32:$mask_and_clamp, ShflKindAttr:$kind, @@ -1346,6 +1346,11 @@ def NVVM_ShflOp : a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. + The `return_value_and_is_valid` unit attribute can be specified to indicate + that the return value is a two-element struct, where the first element is + the result value and the second element is a predicate indicating if the + computed source lane index is valid. 
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync) }]; string llvmBuilder = [{ diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index f2e55f255ceac..262d9b753a2d7 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -920,15 +920,40 @@ LogicalResult MmaOp::verify() { } LogicalResult ShflOp::verify() { - if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid")) - return success(); - auto type = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); - auto elementType = (type && type.getBody().size() == 2) - ? llvm::dyn_cast<IntegerType>(type.getBody()[1]) - : nullptr; - if (!elementType || elementType.getWidth() != 1) - return emitError("expected return type to be a two-element struct with " - "i1 as the second element"); + auto returnStructType = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); + + auto verifyTypeError = [&](Twine desc, Type expectedType, + Type actualType) -> LogicalResult { + return emitOpError("expected " + desc + " to be of type ") + << expectedType << " but got " << actualType << " instead"; + }; + + if (returnStructType) { + if (!getReturnValueAndIsValid()) + return emitOpError("\"return_value_and_is_valid\" attribute must be " + "specified when the return type is a struct type"); + + if (returnStructType.getBody().size() != 2) + return emitOpError("expected return type to be a two-element struct"); + + llvm::ArrayRef<Type> returnStruct = returnStructType.getBody(); + auto resultType = returnStruct[0]; + if (resultType != getVal().getType()) + return verifyTypeError("first element in the returned struct", + getVal().getType(), resultType); + + auto predicateType = returnStruct[1]; + if (!predicateType.isInteger(1)) + return verifyTypeError("second element in the returned struct", + mlir::IntegerType::get(getContext(), 1), + predicateType); + } else { + if (getReturnValueAndIsValid()) + return emitOpError("expected return type to be a two-element struct"); + + if (getType() != getVal().getType()) + return verifyTypeError("return type", getVal().getType(), getType()); + } return success(); } @@ -2677,6 +2702,9 @@ LogicalResult Tcgen05LdOp::verify() { if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset()) result = emitError("shape 16x32bx2 requires offset argument"); + if (getShape() != NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && getOffset()) + result = emitError("offset argument is only supported for shape 16x32bx2"); + auto resTy = getRes().getType(); unsigned resLen = isa<VectorType>(resTy) ? 
llvm::cast<VectorType>(resTy).getNumElements() diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index aaf9f8024bfbe..49b6342aea538 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -664,21 +664,21 @@ func.func @zero_non_llvm_type() { // ----- func.func @nvvm_invalid_shfl_pred_1(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 } // ----- func.func @nvvm_invalid_shfl_pred_2(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> } // ----- func.func @nvvm_invalid_shfl_pred_3(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected second element in the returned struct to be of type 'i1' but got 'i32' instead}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> } diff --git a/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir new file mode 100644 index 0000000000000..f2ccfe71a3f23 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +func.func @nvvm_invalid_shfl_pred(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{"return_value_and_is_valid" attribute must be specified when the return type is a struct type}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> !llvm.struct<(f32, i1)> +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_1(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected return type to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> i32 +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_2(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected first element in the returned struct to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 {return_value_and_is_valid} : f32 -> !llvm.struct<(i32, i1)> +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir new file mode 100644 index 0000000000000..1b93f20c15b99 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @nvvm_tcgen05_ld_32x32b_offset(%tmemAddr : !llvm.ptr<6>, %offset : i64) -> () { + // expected-error@+1 {{offset argument is only supported for shape 16x32bx2}} + %ldv2 = nvvm.tcgen05.ld %tmemAddr, %offset { pack, shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32> + llvm.return +} From f7be258c28939b25f5f9ec07174ace0d81ca3eea Mon Sep 17 00:00:00 2001 
From: Jinjie Huang <huangjinjie@bytedance.com> Date: Tue, 4 Nov 2025 18:27:53 +0800 Subject: [PATCH 141/313] [BOLT][NFC] Clean up the outdated option --write-dwp in doc (#166150) Since the "--write-dwp" option was removed in [PR](https://github.com/llvm/llvm-project/pull/100771), this patch cleans up the corresponding documentation and test so they no longer mislead users. --- bolt/docs/CommandLineArgumentReference.md | 5 ----- bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test | 5 +++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s From 5b2f9b53bdb348393bba221c5f69bfac179092c8 Mon Sep 17 00:00:00 2001 From: kper <kevin.per@protonmail.com> Date: Tue, 4 Nov 2025 11:35:40 +0100 Subject: [PATCH 142/313] [SimplifyCFG]: Switch on umin replaces default (#164097) A switch on `umin(a, C)` whose default is unreachable can eliminate the default case by making `C`'s destination the new default. Proof: https://alive2.llvm.org/ce/z/_N6nfs Fixes: https://github.com/llvm/llvm-project/issues/162111 --- llvm/include/llvm/IR/Instructions.h | 5 + llvm/lib/IR/Instructions.cpp | 10 + llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 78 ++++++ .../Transforms/SimplifyCFG/switch-umin.ll | 246 ++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-umin.ll diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination with the given case's destination. + /// Delegates to the underlying SwitchInst::setDefaultDest and transfers the + /// case's branch weight to the default.
+ LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e87cf1a..bb733277e0fad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7570,6 +7570,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. + for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. 
/// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8112,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; 
+ %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 %x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
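For a quick end-to-end view of the simplification above, here is a minimal before/after sketch in LLVM IR, distilled from the new tests; `@sink`, `@clamped`, and the block names are illustrative only, not part of the patch:

```
; Before: the default is unreachable, so every %x >= 3 is clamped by the
; umin onto case 3. simplifySwitchWhenUMin drops the umin, makes the
; constant's destination (%clamp) the new default, and switches on %x.
declare i32 @llvm.umin.i32(i32, i32)
declare void @sink(i32)

define void @clamped(i32 %x) {
entry:
  %min = call i32 @llvm.umin.i32(i32 %x, i32 3)
  switch i32 %min, label %unreachable [
    i32 0, label %zero
    i32 3, label %clamp
  ]

zero:
  call void @sink(i32 0)
  ret void

clamp:
  call void @sink(i32 3)
  ret void

unreachable:
  unreachable
}
```

After `-passes=simplifycfg` the switch keys on `%x` with `%clamp` as its default destination; later cleanups may reduce the remaining two-way switch further, for example to a compare and branch.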
From c4ac31dc52e016a413ac9e2fa98252c6ff643e3c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 10:48:49 +0000 Subject: [PATCH 143/313] [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (#165870) This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store --- llvm/lib/Target/X86/X86ISelLowering.cpp | 40 +++-- llvm/test/CodeGen/X86/bittest-big-integer.ll | 152 ++++--------------- 2 files changed, 46 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4a4d4339f7e1..6edf0185df813 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53355,21 +53355,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); @@ -53378,18 +53368,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTC: X ^ (1 << ShAmt) // // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) - SDValue InsertBit, ShAmt; + SDValue SrcVal, InsertBit, ShAmt; if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + !(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || - sd_match(StoredVal, - m_Or(m_And(m_Specific(LoadVal), - m_Not(m_Shl(m_One(), m_Value(ShAmt)))), - m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53423,7 +53421,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. 
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 87a54a0b9148d..c197a83835506 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1029,144 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 40(%esp,%eax), %edx -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 32(%esp,%eax), %edi -; X86-NEXT: movl 36(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl %ebx, 4(%eax) -; X86-NEXT: shll %cl, %edi -; X86-NEXT: notl %edi -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl $96, %ebx -; X86-NEXT: shrl $3, %ebx -; X86-NEXT: movl (%eax,%ebx), %ebx -; X86-NEXT: andl %edi, (%eax) -; X86-NEXT: notl %esi -; X86-NEXT: andl %esi, 12(%eax) -; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: btl %ecx, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) ; X86-NEXT: jae .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: -; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_multiload_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: andl $96, %r9d -; SSE-NEXT: shrl $3, %r9d -; SSE-NEXT: movl (%rdi,%r9), %r9d -; SSE-NEXT: btl %ecx, %r9d -; SSE-NEXT: jb .LBB22_2 -; SSE-NEXT: # %bb.1: -; SSE-NEXT: movl (%rdx), %eax -; SSE-NEXT: .LBB22_2: -; SSE-NEXT: andq %rsi, (%rdi) -; SSE-NEXT: andq %r8, 8(%rdi) -; SSE-NEXT: # kill: def $eax killed $eax 
killed $rax -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_multiload_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %r8d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %r8, %r8 -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: cmovneq %rax, %r8 -; AVX2-NEXT: notq %rsi -; AVX2-NEXT: notq %r8 -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: andl $96, %r9d -; AVX2-NEXT: shrl $3, %r9d -; AVX2-NEXT: movl (%rdi,%r9), %r9d -; AVX2-NEXT: btl %ecx, %r9d -; AVX2-NEXT: jb .LBB22_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl (%rdx), %eax -; AVX2-NEXT: .LBB22_2: -; AVX2-NEXT: andq %r8, (%rdi) -; AVX2-NEXT: andq %rsi, 8(%rdi) -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_multiload_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %r8d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %r8, %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: shlxq %rcx, %r8, %r8 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %r8, %rsi -; AVX512-NEXT: cmovneq %rax, %r8 -; AVX512-NEXT: notq %rsi -; AVX512-NEXT: notq %r8 -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: andl $96, %r9d -; AVX512-NEXT: shrl $3, %r9d -; AVX512-NEXT: movl (%rdi,%r9), %r9d -; AVX512-NEXT: btl %ecx, %r9d -; AVX512-NEXT: jb .LBB22_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl (%rdx), %eax -; AVX512-NEXT: .LBB22_2: -; AVX512-NEXT: andq %r8, (%rdi) -; AVX512-NEXT: andq %rsi, 8(%rdi) -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB22_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB22_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs From 85c78274358717e4d5d019a801decba5c1add484 Mon Sep 17 00:00:00 2001 From: Dan Blackwell <dan_blackwell@apple.com> Date: Tue, 4 Nov 2025 11:03:59 +0000 Subject: [PATCH 144/313] [ASan][Test-Only] Allow read of size 2 in strcmp.c test (#166179) Occasionally this test fails on Darwin due to `CHECK: READ of size 1` not matching the actual output `CHECK: READ of size 2`. This can happen when the memory before the string `s1` happens to match the first character of `s2`. This patch allows for `READ of size 2` to pass in order to account for the above circumstances. 
rdar://151317947 --- compiler-rt/test/asan/TestCases/strcmp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } From f8656ed4557500345ba29128d9ec85ef1a637240 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 11:37:56 +0000 Subject: [PATCH 145/313] [llvm][dwarfdump] Add --filter-child-tag option to filter by DWARF tags (#165720) This patch adds a new option `--filter-child-tag` (`-t` for short), which makes dwarfdump only dump children whose DWARF tag is in the list of tags specified by the user. Motivating examples are: * dumping all global variables in a CU * dumping all non-static data members of a structure * dumping all module import declarations of a CU * etc. Tags not known to dwarfdump simply never match any child DIE. Note, this flag only takes effect when `--show-children` is set (either explicitly or implicitly); when children are not dumped, the flag is ignored. Example: ``` $ builds/release/bin/llvm-dwarfdump -t DW_TAG_structure_type a.out.dSYM ... 0x0000000c: DW_TAG_compile_unit DW_AT_producer ("clang version 22.0.0git (git@github.com:Michael137/llvm-project.git 737da3347c2fb01dd403420cf83e9b8fbea32618)") DW_AT_language (DW_LANG_C11) ... 0x0000002a: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_byte_size (0x20) 0x00000067: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_name ("__block_descriptor") DW_AT_byte_size (0x10) ... ``` ``` $ builds/release/bin/llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member a.out.dSYM ... 0x0000000c: DW_TAG_compile_unit DW_AT_producer ("clang version 22.0.0git (git@github.com:Michael137/llvm-project.git 737da3347c2fb01dd403420cf83e9b8fbea32618)") DW_AT_language (DW_LANG_C11) DW_AT_name ("macro.c") ... 0x0000002a: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_byte_size (0x20) 0x0000002c: DW_TAG_member DW_AT_name ("__isa") DW_AT_type (0x00000051 "void *") DW_AT_data_member_location (0x00) 0x00000033: DW_TAG_member DW_AT_name ("__flags") DW_AT_type (0x00000052 "int") DW_AT_data_member_location (0x08) 0x0000003a: DW_TAG_member DW_AT_name ("__reserved") DW_AT_type (0x00000052 "int") DW_AT_data_member_location (0x0c) 0x00000041: DW_TAG_member DW_AT_name ("__FuncPtr") DW_AT_type (0x00000056 "void (*)(int)") DW_AT_data_member_location (0x10) 0x00000048: DW_TAG_member DW_AT_name ("__descriptor") DW_AT_type (0x00000062 "__block_descriptor *") DW_AT_alignment (8) DW_AT_data_member_location (0x18) 0x00000067: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_name ("__block_descriptor") DW_AT_byte_size (0x10) 0x0000006a: DW_TAG_member DW_AT_name ("reserved") DW_AT_type (0x00000079 "unsigned long") DW_AT_data_member_location (0x00) 0x00000071: DW_TAG_member DW_AT_name ("Size") DW_AT_type (0x00000079 "unsigned long") DW_AT_data_member_location (0x08) ...
``` --- llvm/docs/CommandGuide/llvm-dwarfdump.rst | 9 ++ llvm/include/llvm/DebugInfo/DIContext.h | 2 + llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 4 +- .../llvm-dwarfdump/X86/filter-child-tag.yaml | 136 ++++++++++++++++++ llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 18 +++ 5 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. + llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef05e1b61..7496c5a084da4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. 
+ +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. 
+# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; From c80faaefe10fcf3bac1c41b78e038d8d7533dd71 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 11:33:11 +0000 Subject: [PATCH 
146/313] [lldb][test] registration-unique.test: don't hardcode the order of recognizer registration These might get registered in a different order on different platforms. Use `CHECK-DAG` to account for that. Attempts to fix the failure seen on the x86 Linux bots: ``` ******************** TEST 'lldb-shell :: Recognizer/registration-unique.test' FAILED ******************** Exit Code: 1 Command Output (stdout): -- RUN: at line 4 lit-file C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp executed command: split-file 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp' note: command had no output on stdout or stderr RUN: at line 6 \users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc -fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp -g -o C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe' --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp' -g -o 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' .---command stderr------------ | clang: warning: argument unused during compilation: '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' [-Wunused-command-line-argument] `----------------------------- RUN: at line 7 \users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe --no-lldbinit -S C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet -b -s C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out | c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe' --no-lldbinit -S 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet' -b -s 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' note: command had no 
output on stdout or stderr executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' .---command stderr------------ | C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test:45:10: error: CHECK: expected string not found in input | # CHECK: Assert StackFrame Recognizer | ^ | <stdin>:20:38: note: scanning from here | 1: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | <stdin>:34:10: note: possible intended match here | 3: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | | Input file: <stdin> | Check file: C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test | | -dump-input=help explains the following input dump. | | Input was: | <<<<<< ``` --- lldb/test/Shell/Recognizer/registration-unique.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test index f6052bb57c940..bc1e4a6ea235b 100644 --- a/lldb/test/Shell/Recognizer/registration-unique.test +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -41,14 +41,14 @@ frame recognizer list # CHECK-NEXT: no matching results found. # CHECK: frame recognizer list -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer # CHECK-NOT: Verbose Trap StackFrame Recognizer # CHECK-NOT: Assert StackFrame Recognizer # FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 # CHECK: frame recognizer list -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer From 0c653514cc2f7e1005fadc330a3f3f904a5a7a52 Mon Sep 17 00:00:00 2001 From: Pablo Antonio Martinez <pamartin@amd.com> Date: Tue, 4 Nov 2025 12:47:18 +0100 Subject: [PATCH 147/313] [mlir][rocdl] Add GlobalLoadAsyncToLDS operation (#165374) Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic. 
This is available on gfx1250+ --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 32 ++++++++++++++++++++ mlir/test/Dialect/LLVMIR/rocdl.mlir | 13 ++++++++ mlir/test/Target/LLVMIR/rocdl.mlir | 13 ++++++++ 3 files changed, 58 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 5241f9a6f2b43..921fdf36a59b0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -692,6 +692,38 @@ def ROCDL_GlobalLoadLDSOp : }]; } +//===---------------------------------------------------------------------===// +// Async load to LDS intrinsic (available in GFX1250) +//===---------------------------------------------------------------------===// + +foreach bitsVal = [8, 32, 64, 128] in { + defvar bitsStr = "b" # !cast<string>(bitsVal); + def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op : + ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> { + dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr, + Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr, + I32Attr:$offset, + I32Attr:$aux); + let arguments = !con(args, baseArgs); + let assemblyFormat = [{ + $globalPtr `,` $ldsPtr `,` $offset `,` $aux + attr-dict `:` type($globalPtr) `,` type($ldsPtr) + }]; + let description = [{ + Asynchronously loads }] # !cast<string>(bitsVal) # [{ bits of data from a global memory pointer + to a Local Data Share (LDS) pointer. + + Available on gfx1250+. + }]; + + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getGlobalPtr(), getLdsPtr()}; + } + }]; + } +} + //===---------------------------------------------------------------------===// // Tensor load/store intrinsics (available in GFX1250) //===---------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index e703600c71c8e..5e857599b65ea 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK-LABEL @rocdl.global.load.async.to.lds + // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>, <3> + llvm.return +} + // CHECK-LABEL @rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8a848221a50dd..3fbd9e0567948 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL: rocdl.global.load.async.to.lds +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: 
!llvm.ptr<3>) { + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32 + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64 + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128 + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + llvm.return +} + // CHECK-LABEL: rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { From 99a1fcad5de62d849cc5ac31a8b3412ead5b5518 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 11:59:43 +0000 Subject: [PATCH 148/313] [UTC] Update AMDGPU asm regexp for private functions (#166169) Since #163011 changed AMDGPU to use ELF mangling, the regexp failed to match private functions because of the inconsistent presence/absence of the .L prefix on the first line of the function e.g.: ``` .Lfoo: ; @foo ``` --- llvm/test/CodeGen/AMDGPU/private-function.ll | 16 ++++++++++++++++ llvm/utils/UpdateTestChecks/asm.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/private-function.ll diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 61f0d679f806d..82377862885c8 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -51,9 +51,9 @@ class string: ) ASM_FUNCTION_AMDGPU_RE = re.compile( - r"\.type\s+_?(?P<func>[^,\n]+),@function\n" + r"\.type\s+(_|\.L)?(?P<func>[^,\n]+),@function\n" r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" 
- r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' + r'^(_|\.L)?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P<body>.*?)\n" # (body of the function) # This list is incomplete r"^\s*(\.Lfunc_end[0-9]+:\n|\.section)", From f59beca459481554aef0d069f4ef4a0a66880593 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 12:00:24 +0000 Subject: [PATCH 149/313] [X86] bittest-big-integer.ll - add test showing dependent BTC/BT/BTS sequence on same i128 all with unknown bit indices (#166351) --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 234 +++++++++++++++++++ 1 file changed, 234 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index c197a83835506..040ae65a33251 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1082,3 +1082,237 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { %ret = select i1 %cmp, i32 %sel, i32 0 ret i32 %ret } + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, 
%esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: sequence_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: 
setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 + %ld = load i128, ptr %word + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 +} From f037f413506af9e32898e102f391175a3a2852ef Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 12:00:44 +0000 Subject: [PATCH 150/313] [IR] Add new function attribute nocreateundeforpoison (#164809) Also add a corresponding intrinsic property that can be used to mark intrinsics that do not introduce poison, for example simple arithmetic intrinsics that propagate poison just like a simple arithmetic instruction. As a smoke test this patch adds the new property to llvm.amdgcn.fmul.legacy. 
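
For illustration, a minimal sketch of how the new attribute surfaces in emitted IR; the declaration below is only an example, but the attribute list mirrors the updated expectation in clang/test/CodeGen/builtin-sqrt.c from this patch:

```llvm
; llvm.sqrt propagates poison from its operand but never creates a fresh
; undef/poison value itself, so after this change it carries
; nocreateundeforpoison alongside the existing default intrinsic attributes.
declare float @llvm.sqrt.f32(float) #0

attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
```

Analyses can then treat such a call like a plain arithmetic instruction when reasoning about poison, which is what the simplified ValueTracking logic in this patch relies on.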
--- clang/test/CodeGen/X86/math-builtins.c | 322 +++++++++--------- clang/test/CodeGen/builtin-sqrt.c | 2 +- clang/test/CodeGen/libcalls.c | 20 +- clang/test/CodeGen/math-libcalls.c | 232 ++++++------- .../cl20-device-side-enqueue-attributes.cl | 2 +- llvm/docs/LangRef.rst | 5 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Attributes.td | 5 + llvm/include/llvm/IR/Intrinsics.td | 85 +++-- llvm/lib/Analysis/ValueTracking.cpp | 76 +---- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 + llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 + llvm/test/Bitcode/attributes.ll | 6 + .../AArch64/replace-with-veclib-armpl.ll | 3 +- .../AMDGPU/amdgpu-simplify-libcall-rootn.ll | 2 +- .../replace-with-veclib-sleef-scalable.ll | 2 +- llvm/test/Feature/intrinsics.ll | 5 +- llvm/test/Linker/drop-attribute.ll | 2 +- llvm/test/Transforms/Attributor/nofree.ll | 6 +- llvm/test/Transforms/Attributor/nosync.ll | 2 +- llvm/test/Transforms/Attributor/willreturn.ll | 16 +- .../EarlyCSE/replace-calls-def-attrs.ll | 2 +- llvm/test/Transforms/LoopIdiom/basic.ll | 2 +- .../AArch64/expand-exp.ll | 2 +- .../Transforms/SimplifyCFG/rangereduce.ll | 2 +- .../remove-attributes-from-intrinsics.ll | 2 +- .../TableGen/Basic/CodeGenIntrinsics.cpp | 2 + llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 3 + .../utils/TableGen/Basic/IntrinsicEmitter.cpp | 8 +- .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 2 +- 31 files changed, 406 insertions(+), 418 deletions(-) diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index a56f8ba1ee385..c7cd9ffdd6966 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f); -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f); __builtin_fabsf128(f); -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// 
NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_frexp(f,i); __builtin_frexpf(f,i); __builtin_frexpl(f,i); __builtin_frexpf128(f,i); -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall(); __builtin_huge_valf128(); @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ldexp(f,f); __builtin_ldexpf(f,f); __builtin_ldexpl(f,f); __builtin_ldexpf128(f,f); -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]] @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_pow(f,f); __builtin_powf(f,f); __builtin_powl(f,f); __builtin_powf128(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_powi(f,f); __builtin_powif(f,f); __builtin_powil(f,f); -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] /* math */ __builtin_acos(f); 
__builtin_acosf(f); __builtin_acosl(f); __builtin_acosf128(f); @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ceil(f); __builtin_ceilf(f); __builtin_ceill(f); __builtin_ceilf128(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_cos(f); __builtin_cosf(f); __builtin_cosl(f); __builtin_cosf128(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -362,10 +362,10 @@ __builtin_erfc(f); __builtin_erfcf(f); __builtin_erfcl(f); __builtin_ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_expf128(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -373,10 +373,10 @@ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_e __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_exp2f128(f); -// NO__ERRNO: declare double 
@llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -384,10 +384,10 @@ __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_ __builtin_exp10(f); __builtin_exp10f(f); __builtin_exp10l(f); __builtin_exp10f128(f); -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -417,22 +417,22 @@ __builtin_fdim(f,f); __builtin_fdimf(f,f); __builtin_fdiml(f,f); __bu __builtin_floor(f); __builtin_floorf(f); __builtin_floorl(f); __builtin_floorf128(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 
@llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]] -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]] +// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -454,25 +454,25 @@ __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fmin(f,f); __builtin_fminf(f,f); __builtin_fminl(f,f); __builtin_fminf128(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_hypot(f,f); __builtin_hypotf(f,f); __builtin_hypotl(f,f); __builtin_hypotf128(f,f); @@ -509,10 +509,10 @@ __builtin_lgamma(f); __builtin_lgammaf(f); __builtin_lgammal(f); __builti __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builtin_llrintf128(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -520,10 +520,10 @@ __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builti __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __builtin_llroundf128(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -531,10 +531,10 @@ __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __built __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_logf128(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float 
noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -542,10 +542,10 @@ __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_l __builtin_log10(f); __builtin_log10f(f); __builtin_log10l(f); __builtin_log10f128(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -564,10 +564,10 @@ __builtin_log1p(f); __builtin_log1pf(f); __builtin_log1pl(f); __builtin __builtin_log2(f); __builtin_log2f(f); __builtin_log2l(f); __builtin_log2f128(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -586,10 +586,10 @@ __builtin_logb(f); __builtin_logbf(f); __builtin_logbl(f); __builtin_ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin_lrintf128(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -597,10 +597,10 @@ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __builtin_lroundf128(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: 
declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -608,14 +608,14 @@ __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __built __builtin_nearbyint(f); __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_nextafter(f,f); __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f); @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i); __builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __ __builtin_rint(f); __builtin_rintf(f); __builtin_rintl(f); __builtin_rintf128(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: 
declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_round(f); __builtin_roundf(f); __builtin_roundl(f); __builtin_roundf128(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_scalbln(f,f); __builtin_scalblnf(f,f); __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f); @@ -707,10 +707,10 @@ __builtin_scalbn(f,f); __builtin_scalbnf(f,f); __builtin_scalbnl(f,f); __ __builtin_sin(f); __builtin_sinf(f); __builtin_sinl(f); __builtin_sinf128(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_sqrtf128(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -791,22 +791,24 @@ __builtin_tgamma(f); __builtin_tgammaf(f); 
__builtin_tgammal(f); __builti __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c index 2313a68d2d0e2..3ebf2ac91ccdf 100644 --- a/clang/test/CodeGen/builtin-sqrt.c +++ b/clang/test/CodeGen/builtin-sqrt.c @@ -11,5 +11,5 @@ float foo(float X) { // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none) // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]] -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c index 1e4b06e34aaf9..923719f3ec8e4 100644 --- a/clang/test/CodeGen/libcalls.c +++ b/clang/test/CodeGen/libcalls.c @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) { // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef) // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef) // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]] -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]] -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]] +// CHECK-NO: declare 
double @llvm.fma.f64(double, double, double) [[NUW_RNI]] +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]] // Just checking to make sure these library functions are marked readnone void test_builtins(double d, float f, long double ld) { @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) { double atan_ = atan(d); long double atanl_ = atanl(ld); float atanf_ = atanf(f); -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]] -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]] +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]] // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]] // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atanf(float noundef) [[NUW]] @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) { double atan2_ = atan2(d, 2); long double atan2l_ = atan2l(ld, ld); float atan2f_ = atan2f(f, f); -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]] -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]] +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]] // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]] // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]] @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) { } // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" } -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index ad297828f48ed..d4cd6f86b3c51 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { copysign(f,f); copysignf(f,f);copysignl(f,f); - // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: 
declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fabs(f); fabsf(f); fabsl(f); - // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] frexp(f,i); frexpf(f,i); frexpl(f,i); @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { pow(f,f); powf(f,f); powl(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double 
@llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { ceil(f); ceilf(f); ceill(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80( cos(f); cosf(f); cosl(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp(f); expf(f); expl(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp2(f); exp2f(f); exp2l(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 
@llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { floor(f); floorf(f); floorl(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64 // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80( fma(f,f,f); fmaf(f,f,f); fmal(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { fmax(f,f); fmaxf(f,f); fmaxl(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80( fmin(f,f); fminf(f,f); fminl(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80( fmaximum_num(*d,*d); fmaximum_numf(f,f); fmaximum_numl(*l,*l); -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fminimum_num(*d,*d); fminimum_numf(f,f); fminimum_numl(*l,*l); -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] hypot(f,f); hypotf(f,f); hypotl(f,f); @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llrint(f); llrintf(f); llrintl(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: 
declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llround(f); llroundf(f); llroundl(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log(f); logf(f); logl(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log10(f); log10f(f); log10l(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log2(f); log2f(f); log2l(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 
@log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lrint(f); lrintf(f); lrintl(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lround(f); lroundf(f); lroundl(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { nearbyint(f); nearbyintf(f); nearbyintl(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80( @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { rint(f); rintf(f); rintl(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// 
HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80( round(f); roundf(f); roundl(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80( @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { sin(f); sinf(f); sinl(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -674,9 +674,9 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); sqrt(f); sqrtf(f); sqrtl(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -722,20 +722,22 @@ sincos(f, d, d); 
sincosf(f, fp, fp); sincosl(f, l, l); trunc(f); truncf(f); truncl(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index ea1f734391614..5cbf6452d4c85 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c089b5a0ba79..b9507a2d054fe 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2741,6 +2741,11 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. 
+``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison, + provided that no argument is undef or poison. If this guarantee is + violated, the behavior is undefined. Call Site Attributes ---------------------- diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..6a079f62dd9cf 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone.
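Note that the new LangRef contract is conditional: the callee may still be passed undef or poison, in which case the attribute promises nothing. A minimal IR sketch of the contract, using a hypothetical function @widget (the name, signature, and the noundef parameter are illustrative, not part of the patch):

```llvm
; Hypothetical helper; nocreateundeforpoison is the attribute added above.
declare i32 @widget(i32, i32) nocreateundeforpoison

define i1 @caller(i32 noundef %a) {
  ; %a is noundef (so neither undef nor poison inside this function) and
  ; the constant 7 is well-defined, so the attribute guarantees %r is
  ; neither undef nor poison.
  %r = call i32 @widget(i32 %a, i32 7)
  %c = icmp eq i32 %r, 0
  ; Branching on %c therefore cannot be a branch on poison.
  br i1 %c, label %yes, label %no
yes:
  ret i1 true
no:
  ret i1 false
}
```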
def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit, an unspecified value is returned, but this + // is not poison, so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone.
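The comment on the l(l)round/l(l)rint family deserves a concrete reading: an out-of-range input produces an unspecified value, not poison, which is why the new property still applies. A hedged sketch (the i32/f32 instantiation is just one legal mangling of the polymorphic intrinsic, and @round_it is a made-up wrapper):

```llvm
declare i32 @llvm.lround.i32.f32(float)

define i32 @round_it(float noundef %f) {
  ; %f is well-defined, so %r is a well-defined i32 even when %f does
  ; not fit in 32 bits; its exact value is merely unspecified then.
  %r = call i32 @llvm.lround.i32.f32(float %f)
  ret i32 %r
}
```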
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
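Since the with.overflow and saturating intrinsics above now carry IntrNoCreateUndefOrPoison, their results are well-defined whenever their inputs are, so consumers of the overflow bit need no defensive freeze. A small illustrative example (@checked_add is a hypothetical function, not from the patch):

```llvm
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)

define i32 @checked_add(i32 noundef %a, i32 noundef %b) {
  ; Well-defined inputs yield a well-defined {sum, overflow} pair.
  %pair = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  %sum = extractvalue { i32, i1 } %pair, 0
  %ovf = extractvalue { i32, i1 } %pair, 1
  ; Safe to branch directly on the overflow flag.
  br i1 %ovf, label %sat, label %ok
sat:
  ret i32 2147483647 ; saturate at INT32_MAX on overflow
ok:
  ret i32 %sum
}
```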
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..523374bdc472f 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). 
- return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 1bdf3e9f684f5..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..608661583c3db 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
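The ValueTracking change above is the consumer side: canCreateUndefOrPoison for a call now also returns false when the callee carries the new function attribute, not only when the call site has a noundef return. The two guarantees differ in strength; a contrast sketch with two hypothetical declarations (@always_defined and @defined_if_args_are are illustrative names only):

```llvm
; noundef on the return: the result is never undef or poison,
; unconditionally (returning such a value would be immediate UB).
declare noundef i32 @always_defined(i32)
; nocreateundeforpoison: the result is well-defined only if every
; argument is well-defined.
declare i32 @defined_if_args_are(i32) nocreateundeforpoison

define i32 @demo(i32 %maybe_poison) {
  ; %r1 may be assumed well-defined regardless of the argument.
  %r1 = call i32 @always_defined(i32 %maybe_poison)
  ; %r2 is only as well-defined as %maybe_poison; ValueTracking must
  ; still reason about the argument.
  %r2 = call i32 @defined_if_args_are(i32 %maybe_poison)
  %s = add i32 %r1, %r2
  ret i32 %s
}
```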
case Attribute::AlwaysInline: diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 +521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. ; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll index c489bc3681876..aa63552eb4b63 100644 --- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll @@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>) declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>) ;. 
; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } ;. diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. -; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll index 9be95a89109b4..3d4c13c2ffc75 100644 --- a/llvm/test/Linker/drop-attribute.ll +++ b/llvm/test/Linker/drop-attribute.ll @@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped. -; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress 
nofree noinline nosync nounwind willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; 
CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 +1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index cd866469792a2..ff894853b9771 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -407,6 +407,8 @@ void CodeGenIntrinsic::setProperty(const Record *R) { hasSideEffects = true; else if (R->getName() == "IntrStrictFP") isStrictFP = true; + else if (R->getName() == "IntrNoCreateUndefOrPoison") + isNoCreateUndefOrPoison = true; else if (R->isSubClassOf("NoCapture")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); addArgAttribute(ArgNo, NoCapture); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 2e86149514f46..15e803c4feba1 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -114,6 +114,9 @@ struct CodeGenIntrinsic { // True if the intrinsic is marked as strictfp. bool isStrictFP = false; + // True if the intrinsic is marked as IntrNoCreateUndefOrPoison. 
+ bool isNoCreateUndefOrPoison = false; + enum ArgAttrKind { NoCapture, NoAlias, diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 75dffb18fca5a..452d2b08f25c3 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -421,7 +421,8 @@ static bool compareFnAttributes(const CodeGenIntrinsic *L, return std::tie(I->canThrow, I->isNoDuplicate, I->isNoMerge, I->isNoReturn, I->isNoCallback, I->isNoSync, I->isNoFree, I->isWillReturn, I->isCold, I->isConvergent, I->isSpeculatable, - I->hasSideEffects, I->isStrictFP); + I->hasSideEffects, I->isStrictFP, + I->isNoCreateUndefOrPoison); }; auto TieL = TieBoolAttributes(L); @@ -446,7 +447,8 @@ static bool hasFnAttributes(const CodeGenIntrinsic &Int) { return !Int.canThrow || Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || Int.isConvergent || Int.isSpeculatable || - Int.isStrictFP || getEffectiveME(Int) != MemoryEffects::unknown(); + Int.isStrictFP || Int.isNoCreateUndefOrPoison || + getEffectiveME(Int) != MemoryEffects::unknown(); } namespace { @@ -605,6 +607,8 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { addAttribute("Speculatable"); if (Int.isStrictFP) addAttribute("StrictFP"); + if (Int.isNoCreateUndefOrPoison) + addAttribute("NoCreateUndefOrPoison"); const MemoryEffects ME = getEffectiveME(Int); if (ME != MemoryEffects::unknown()) { diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 60bd24a27868e..1e4cf8d4589cb 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1308,7 +1308,7 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 // CHECK-DAG: declare { float, float } @llvm.sincos.f32(float) -// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) #0 +// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare float @llvm.rint.f32(float) // CHECK-DAG: declare double @llvm.rint.f64(double) From 0a95a86634a8a8d2e95a390828e345058b4a723f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com> Date: Tue, 4 Nov 2025 12:02:58 +0000 Subject: [PATCH 151/313] [VPlan] Fix first-lane comment in sinkScalarOperands (NFC) (#166347) To follow up on a post-commit review. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e8e87aa8a4f3c..2588c878d8472 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. 
In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { From dbce71382c65450edd7e02d62268e8b7dfcb7ed6 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 12:03:16 +0000 Subject: [PATCH 152/313] [AMDGPU] Skip debug instructions when eliminating S_SET_GPR_IDX_ON/OFF (#160715) --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..abefa32b8f802 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... 
+--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... + --- name: valu_write_in_between body: | From 8636c40f4e5451fd9a6b64c22abe0e8d561f7c89 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 12:01:15 +0000 Subject: [PATCH 153/313] [lldb][test] Skip registration-unique.test on Windows Fails with: ``` RUN: at line 4 split-file C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp executed command: split-file 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp' note: command had no output on stdout or stderr RUN: at line 6 c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc -fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp -g -o C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe' --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp' -g -o 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' .---command stderr------------ | clang: warning: argument 
unused during compilation: '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' [-Wunused-command-line-argument] `----------------------------- RUN: at line 7 c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe --no-lldbinit -S C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet -b -s C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out | c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe' --no-lldbinit -S 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet' -b -s 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' note: command had no output on stdout or stderr executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' .---command stderr------------ | C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test:45:10: error: CHECK: expected string not found in input | # CHECK: Assert StackFrame Recognizer | ^ | <stdin>:20:38: note: scanning from here | 1: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | <stdin>:34:10: note: possible intended match here | 3: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | | Input file: <stdin> | Check file: C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test | | -dump-input=help explains the following input dump. ``` --- lldb/test/Shell/Recognizer/registration-unique.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test index bc1e4a6ea235b..34400d9a27575 100644 --- a/lldb/test/Shell/Recognizer/registration-unique.test +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -1,3 +1,5 @@ +# UNSUPPORTED: system-windows + # Checks that the recognizers that should work across language runtimes # are only registered once with the target. 
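The fix itself is just the lit directive added above. As a generic sketch of the mechanism (a hypothetical test file, not the one touched here), lit matches `UNSUPPORTED:` expressions against the feature set it computes for the host, and `system-windows` is one of the features it defines automatically on Windows, so a header like the following skips the whole file there and runs it everywhere else:

```llvm
; Hypothetical .ll test skeleton; lit evaluates the directive below against
; the host's feature list before executing any RUN line.
; UNSUPPORTED: system-windows
; RUN: llc %s -o /dev/null

define void @f() {
  ret void
}
```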
From 51269e220da64637b780791a28fb187cbc36084d Mon Sep 17 00:00:00 2001 From: Haojian Wu <hokein.wu@gmail.com> Date: Tue, 4 Nov 2025 13:01:08 +0100 Subject: [PATCH 154/313] [bazel] Fix bazel build for bb4ed55acdbc7f48bc978147189e8106e3ea42f8 --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index ae9ff2878e03b..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -80,6 +80,7 @@ cc_library( hdrs = glob(["LanguageRuntime/CPlusPlus/*.h"]), includes = [".."], deps = [ + "//clang:codegen", "//lldb:CoreHeaders", "//lldb:Headers", "//lldb:SymbolHeaders", From bea31dd373e3f053f0b3f1862c6b106831e1f25d Mon Sep 17 00:00:00 2001 From: Karlo Basioli <basioli@google.com> Date: Tue, 4 Nov 2025 13:09:32 +0100 Subject: [PATCH 155/313] Fix bazel build issue after #166157 (#166358) --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 3ded8bdf89514..81d4894e2bf1e 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -87,6 +87,7 @@ cc_library( "//lldb:TargetHeaders", "//lldb:Utility", "//llvm:Support", + "//clang:codegen", ], ) From 8cb0c0ce1be95e0d0a9b2f366844d1a4e11d5268 Mon Sep 17 00:00:00 2001 From: Karlo Basioli <basioli@google.com> Date: Tue, 4 Nov 2025 13:11:21 +0100 Subject: [PATCH 156/313] Revert "Fix bazel build issue after #166157" (#166359) Reverts llvm/llvm-project#166358 as this was fixed by 51269e220da64637b780791a28fb187cbc36084d --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 81d4894e2bf1e..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -87,7 +87,6 @@ cc_library( "//lldb:TargetHeaders", "//lldb:Utility", "//llvm:Support", - "//clang:codegen", ], ) From 28a20b4af94465155f64228746adc07a82368687 Mon Sep 17 00:00:00 2001 From: Julian Nagele <j.nagele@apple.com> Date: Tue, 4 Nov 2025 12:39:04 +0000 Subject: [PATCH 157/313] [VectorCombine] Avoid inserting freeze when scalarizing extend-extract if all extracts would lead to UB on poison. (#164683) This change aims to avoid inserting a freeze instruction between the load and bitcast when scalarizing extend-extract. This is particularly useful in combination with https://github.com/llvm/llvm-project/pull/164682, which can then potentially further scalarize, provided there is no freeze. 
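To make the transformation concrete, here is a hand-written IR sketch modelled on the tests added below (function and value names are invented, not taken from the patch). Because every lane of the extend is extracted and each extract flows into a `noundef` return, poison in %src would already be immediate UB, so the scalarized form may bitcast the source directly instead of freezing it first. The byte layout assumes a little-endian target such as the AArch64 configuration the tests use:

```llvm
; Sketch of the scalarized output when no freeze is required.
define noundef i32 @sum_lanes(<4 x i8> %src) {
  %bc = bitcast <4 x i8> %src to i32   ; no freeze of %src needed
  %b0 = and i32 %bc, 255               ; lane 0 (low byte)
  %s1 = lshr i32 %bc, 8
  %b1 = and i32 %s1, 255               ; lane 1
  %s2 = lshr i32 %bc, 16
  %b2 = and i32 %s2, 255               ; lane 2
  %b3 = lshr i32 %bc, 24               ; lane 3 (high byte)
  %a0 = add i32 %b0, %b1
  %a1 = add i32 %a0, %b2
  %a2 = add i32 %a1, %b3
  ret i32 %a2
}
```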
alive2 proof: https://alive2.llvm.org/ce/z/W-GD88 --- .../Transforms/Vectorize/VectorCombine.cpp | 27 ++- .../VectorCombine/AArch64/ext-extract.ll | 186 ++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..27a8bbd5776be 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll index 60700412686ea..e7b11cdf8475e 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll @@ -346,3 +346,189 @@ entry: call void @use.i32(i32 %ext.3) ret void } + +define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext,
i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.3 + ret i32 %add2 +} + +define i32 @zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) { +; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks( +; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = 
and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + br i1 %cond, label %then, label %else + +then: + %ext.2 = extractelement <4 x i32> %ext, i64 2 + br label %exit + +else: + %ext.3 = extractelement <4 x i32> %ext, i64 3 + br label %exit + +exit: + %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ] + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %phi + ret i32 %add2 +} + + +declare void @may_throw() willreturn + +define noundef i32 @zext_v4i8_throwing_call_between(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + call void @may_throw() + %ext.3 = extractelement <4 x i32> %ext, i64 3 + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} From 0307147105d569e45f1d5a7e81d128e90fd648be Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Tue, 4 Nov 2025 14:45:53 +0200 Subject: [PATCH 158/313] [NFC][SPIRV] Add AMDGCN SPIR-V specific defaults to the BE (#165815) AMDGCN flavoured SPIR-V has slightly different defaults from what the BE adopts: it assumes all extensions are enabled, and expects nonsemantic debug info to be generated. 
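As a rough usage sketch (a hypothetical reduced test, not one of the files touched below), none of the usual opt-in flags are needed once the AMD vendor triple is used; the triple alone is expected to select SPIR-V 1.6, enable all extensions, and emit nonsemantic debug info:

```llvm
; REQUIRES: spirv-tools
; Hypothetical test: no --spirv-ext=all and no --spv-emit-nonsemantic-debug-info.
; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s
; CHECK: Version: 1.6

define spir_func void @empty() {
  ret void
}
```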
Furthermore, it is necessary to encode in the resulting SPIR-V binary that what was generated was AMDGCN flavoured, which we do by setting the Generator Version to `UINT16_MAX` (which matches what we expect to see at reverse translation). We will register this generator version at <https://github.com/KhronosGroup/SPIRV-Headers>. This is a preliminary patch out of a series of patches that are needed for adopting the BE for AMDGCN flavoured SPIR-V generation. --- llvm/lib/MC/SPIRVObjectWriter.cpp | 7 +++++-- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 11 +++++++++-- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 3 ++- .../CodeGen/SPIRV/debug-info/debug-type-pointer.ll | 4 +++- llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll | 2 ++ .../CodeGen/SPIRV/extensions/enable-all-extensions.ll | 1 + .../SPIRV/physical-layout/generator-magic-number.ll | 2 ++ .../CodeGen/SPIRV/physical-layout/spirv-version.ll | 2 ++ 8 files changed, 26 insertions(+), 6 deletions(-) diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. 
initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 5ba035682238b..2951a4bc695e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. 
; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 -mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 From 747050bcceca18d32dc1140461984ec2c30ae96a Mon Sep 17 00:00:00 2001 From: Dmitry 
Chigarev <dmitry.chigarev@intel.com> Date: Tue, 4 Nov 2025 13:52:23 +0100 Subject: [PATCH 159/313] [MLIR][XeGPU][VectorToXeGPU] Lower vector.load/store/transfer_read/transfer_write to new offsets syntax (#162095) Changes the `VectorToXeGPU` pass to generate `xegpu.load_nd/store_nd` ops using the new syntax, where offsets are specified at the load/store op level. ```mlir // from this %desc = xegpu.create_nd_tdesc %src[%off1, %off2]: memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %res = xegpu.load_nd %desc : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> // to this %desc = xegpu.create_nd_tdesc %src: memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %res = xegpu.load_nd %desc[%off1, %off2] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> ``` In order to support cases with dimension reduction at the `create_nd_tdesc` level (e.g. `memref<8x8x16xf16> -> tensor_desc<8x16xf16>`), it was decided to insert a memref.subview that collapses the source shape to 2D, for example: ```mlir // input: %0 = vector.load %source[%off0, %off1, %off2] : memref<8x16x32xf32>, vector<8x16xf32> // --vector-to-xegpu (old) %tdesc = xegpu.create_nd_tdesc %source[%off0, %off1, %off2] : memref<8x16x32xf32> -> tdesc<8x32xf32> %vec = xegpu.load_nd %tdesc // --vector-to-xegpu (new) %collapsed = memref.subview %source[%off0, 0, 0] [1, 16, 32] [1, 1, 1] : memref<8x16x32xf32> -> memref<16x32xf32, strided<[32, 1], offset: ?>> %tdesc = xegpu.create_nd_tdesc %collapsed : memref<16x32xf32, ...> -> tdesc<8x32xf32> %vec = xegpu.load_nd %tdesc[%off1, %off2] ``` <details><summary>Why do we need to change that?</summary> ```mlir // reduce dim and apply all 3 offsets at load_nd %desc = xegpu.create_nd_tdesc %source : memref<8x16x32xf32> -> !xegpu.tensor_desc<16x32xf32> // error: xegpu.load_nd len(offsets) != desc.rank %res = xegpu.load_nd %desc[%off, %off, %off] : !xegpu.tensor_desc<16x32xf32> -> vector<8x16xf32> ``` </details> --------- Signed-off-by: dchigarev <dmitry.chigarev@intel.com> --- .../VectorToXeGPU/VectorToXeGPU.cpp | 174 +++++++++++------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 +- .../VectorToXeGPU/load-to-xegpu.mlir | 39 ++-- .../VectorToXeGPU/store-to-xegpu.mlir | 39 ++-- .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 53 +++--- .../transfer-write-to-xegpu.mlir | 51 +++-- 6 files changed, 196 insertions(+), 172 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 91c1aa55fdb4e..abea84f6b01fe 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -97,57 +97,23 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter, return success(); } -static xegpu::CreateNdDescOp -createNdDescriptor(PatternRewriter &rewriter, Location loc, - xegpu::TensorDescType descType, TypedValue<MemRefType> src, - Operation::operand_range offsets) { +static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, + Location loc, + xegpu::TensorDescType descType, + TypedValue<MemRefType> src) { MemRefType srcTy = src.getType(); auto [strides, offset] = srcTy.getStridesAndOffset(); xegpu::CreateNdDescOp ndDesc; if (srcTy.hasStaticShape()) { - ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, - getAsOpFoldResult(offsets)); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src); } else { // In case of any dynamic shapes, source's shape and strides have to be // explicitly provided.
- SmallVector<Value> sourceDims; - unsigned srcRank = srcTy.getRank(); - for (unsigned i = 0; i < srcRank; ++i) - sourceDims.push_back(memref::DimOp::create(rewriter, loc, src, i)); - - SmallVector<int64_t> constOffsets; - SmallVector<Value> dynOffsets; - for (Value offset : offsets) { - std::optional<int64_t> staticVal = getConstantIntValue(offset); - if (!staticVal) - dynOffsets.push_back(offset); - constOffsets.push_back(staticVal.value_or(ShapedType::kDynamic)); - } - - SmallVector<Value> dynShapes; - for (auto [idx, shape] : llvm::enumerate(srcTy.getShape())) { - if (shape == ShapedType::kDynamic) - dynShapes.push_back(sourceDims[idx]); - } - - // Compute strides in reverse order. - SmallVector<Value> dynStrides; - Value accStride = arith::ConstantIndexOp::create(rewriter, loc, 1); - // Last stride is guaranteed to be static and unit. - for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i) { - accStride = - arith::MulIOp::create(rewriter, loc, accStride, sourceDims[i + 1]); - if (strides[i] == ShapedType::kDynamic) - dynStrides.push_back(accStride); - } - std::reverse(dynStrides.begin(), dynStrides.end()); - - ndDesc = xegpu::CreateNdDescOp::create( - rewriter, loc, descType, src, dynOffsets, dynShapes, dynStrides, - DenseI64ArrayAttr::get(rewriter.getContext(), constOffsets), - DenseI64ArrayAttr::get(rewriter.getContext(), srcTy.getShape()), - DenseI64ArrayAttr::get(rewriter.getContext(), strides)); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, + meta.getConstifiedMixedSizes(), + meta.getConstifiedMixedStrides()); } return ndDesc; @@ -392,6 +358,62 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp, .getResult(); } +// Collapses shapes of an nD memref to the target rank while applying offsets for +// the collapsed dimensions. Returns the new memref value and the remaining +// offsets for the last targetRank dimensions.
For example: +// input: %memref = memref<2x4x8x32xf32>, offsets=[%i0, %i1, %i2, %i3], +// output: %memref[%i0, %i1, 0, 0] -> memref<8x32xf32>, offsets: [%i2, %i3] +static std::pair<Value, SmallVector<OpFoldResult>> +convertMemrefAndOffsetsToTargetRank(PatternRewriter &rewriter, Location loc, + Value memref, + SmallVector<OpFoldResult> offsets, + int64_t targetRank) { + auto memrefType = cast<MemRefType>(memref.getType()); + unsigned rank = memrefType.getRank(); + + if (rank <= targetRank) + return {memref, offsets}; + + int64_t numCombinedDims = rank - targetRank; + SmallVector<OpFoldResult> subviewOffsets; + SmallVector<OpFoldResult> subviewSizes; + SmallVector<OpFoldResult> subviewStrides; + + // For the combined dimensions: use the provided offsets, size=1, stride=1 + for (unsigned i = 0; i < numCombinedDims; ++i) { + subviewOffsets.push_back(offsets[i]); + subviewSizes.push_back(rewriter.getI64IntegerAttr(1)); + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + // For the last targetRank dimensions: offset=0, use full size, stride=1 + SmallVector<int64_t> resultShape; + auto originalShape = memrefType.getShape(); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, memref); + for (unsigned i = numCombinedDims; i < rank; ++i) { + subviewOffsets.push_back(rewriter.getI64IntegerAttr(0)); + if (ShapedType::isDynamic(originalShape[i])) { + subviewSizes.push_back(meta.getSizes()[i]); + resultShape.push_back(ShapedType::kDynamic); + } else { + subviewSizes.push_back(rewriter.getI64IntegerAttr(originalShape[i])); + resultShape.push_back(originalShape[i]); + } + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + auto resultType = memref::SubViewOp::inferRankReducedResultType( + resultShape, memrefType, subviewOffsets, subviewSizes, subviewStrides); + auto subviewOp = + memref::SubViewOp::create(rewriter, loc, resultType, memref, + subviewOffsets, subviewSizes, subviewStrides); + + // Return the remaining offsets for the last targetRank dimensions + SmallVector<OpFoldResult> newOffsets(offsets.begin() + numCombinedDims, + offsets.end()); + return {subviewOp.getResult(), newOffsets}; +} + template < typename OpType, typename = std::enable_if_t<llvm::is_one_of< @@ -523,18 +545,19 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { descShape, elementType, /*array_length=*/1, /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(readOp.getBase()), - readOp.getIndices()); - DenseI64ArrayAttr transposeAttr = !isTransposeLoad ? nullptr : DenseI64ArrayAttr::get(rewriter.getContext(), ArrayRef<int64_t>{1, 0}); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()), + vecTy.getRank()); // By default, no specific caching policy is assigned. 
xegpu::CachePolicyAttr hint = nullptr; - auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, /*packed=*/nullptr, transposeAttr, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); @@ -575,21 +598,23 @@ struct TransferWriteLowering if (!map.isMinorIdentity()) return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, writeOp.getBase(), + getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(), xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(writeOp.getBase()), - writeOp.getIndices()); - // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto storeOp = - xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), ndDesc, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto storeOp = xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), + ndDesc, indices, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(writeOp, storeOp); return success(); @@ -674,19 +699,24 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + // By default, no specific caching policy is assigned. + xegpu::CachePolicyAttr hint = nullptr; + + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, loadOp.getBase(), getAsOpFoldResult(loadOp.getIndices()), + vecTy.getRank()); auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, loadOp.getBase(), loadOp.getIndices()); - // By default, no specific caching policy is assigned. - xegpu::CachePolicyAttr hint = nullptr; - auto loadNdOp = xegpu::LoadNdOp::create( - rewriter, loc, vecTy, ndDesc, /*packed=*/nullptr, /*transpose=*/nullptr, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto loadNdOp = + xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, + /*packed=*/nullptr, /*transpose=*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(loadOp, loadNdOp); return success(); @@ -708,18 +738,24 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { // Boundary check is available only for block instructions. 
bool boundaryCheck = vecTy.getRank() > 1; + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, storeOp.getBase(), + getAsOpFoldResult(storeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, storeOp.getBase(), storeOp.getIndices()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto storeNdOp = - xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, + xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, indices, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(storeOp, storeNdOp); return success(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 7b6c4b6c2c813..c8f5c86c03686 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -280,8 +280,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for the full-static case, otherwise + // the printer would fail trying to print an empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -342,8 +344,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for the full-static case, otherwise + // the printer would fail trying to print an empty array-attr).
+ if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 9908205f07c92..ae5141db16c09 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -9,11 +9,12 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-LABEL: @load_1D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // CHECK: return %[[VEC]] // ----- @@ -28,35 +29,29 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-LABEL: @load_2D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- func.func @load_dynamic_source(%source: memref<?x?x?xf32>, - %offset: index) -> vector<8x16xf32> { - %0 = vector.load %source[%offset, %offset, %offset] + %i: index, %j: index, %k: index) -> vector<8x16xf32> { + %0 = vector.load %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return %0 : vector<8x16xf32> } // CHECK-LABEL: @load_dynamic_source( // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: 
%[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- @@ -72,9 +67,9 @@ func.func @load_out_of_bounds(%source: memref<7x15xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x15xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x15xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 2c498dcc2a071..1a10d917623cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -11,11 +11,12 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // ----- @@ -30,16 +31,17 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- func.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.store %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.store %vec, %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return } @@ -47,18 +49,11 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-LABEL: @store_dynamic_source( // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc 
%[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // ----- @@ -74,9 +69,9 @@ func.func @store_out_of_bounds(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x64xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c4ca79af1bd9a..c87a5304babfe 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -12,11 +12,12 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector // LOAD-ND-LABEL: @load_1D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_1D_vector( @@ -46,11 +47,12 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-LABEL: @load_2D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_2D_vector( @@ -83,9 +85,9 @@ gpu.func @load_zero_pad_out_of_bounds(%source: memref<32x64xf32>, // LOAD-ND-LABEL: @load_zero_pad_out_of_bounds( // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc 
%[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_zero_pad_out_of_bounds( @@ -109,9 +111,9 @@ gpu.func @load_transposed(%source: memref<32x64xf32>, // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET1:.+]]: index, // LOAD-ND-SAME: %[[OFFSET2:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET1]], %[[OFFSET2]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32 -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] <{transpose = array<i64: 1, 0>}> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] <{transpose = array<i64: 1, 0>}> // LOAD-ND-SAME: -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -143,16 +145,11 @@ gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>, } // LOAD-ND-LABEL: @load_dynamic_source( // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[C2:.+]] = arith.constant 2 : index -// LOAD-ND: %[[C1:.+]] = arith.constant 1 : index -// LOAD-ND: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// LOAD-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// LOAD-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// LOAD-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET:.+]], %[[OFFSET:.+]], %[[OFFSET:.+]]] -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -184,10 +181,11 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>, } // LOAD-ND-LABEL: @load_dynamic_source2( -// LOAD-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM:.+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x8x16xf32> -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], shape : [%[[DIM]], 8, 16], strides : [128, 16, 1] : memref<?x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> +// LOAD-ND-SAME: %[[SRC:.+]]: memref<?x8x16xf32>, +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32, strided<[16, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> 
vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> // LOAD-GATHER-LABEL: @load_dynamic_source2( @@ -418,11 +416,12 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf16> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_from_subview( diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index fcfc9414da4f6..43a1a7206e2cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --xevm-attach-target='module=xevm.* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND +// RUN: mlir-opt %s --xevm-attach-target='module=xevm_* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND // RUN: mlir-opt %s -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-SCATTER @@ -15,11 +15,12 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // STORE-SCATTER-LABEL: @store_1D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf32>, @@ -49,11 +50,12 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], 
%[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_2D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -73,8 +75,8 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // ----- gpu.module @xevm_module { gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.transfer_write %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.transfer_write %vec, %source[%i, %j, %k] {in_bounds = [true, true]} : vector<8x16xf32>, memref<?x?x?xf32> gpu.return @@ -83,18 +85,11 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-LABEL: @store_dynamic_source( // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// STORE-ND-SAME: %[[OFFSET:.+]]: index -// STORE-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// STORE-ND-DAG: %[[C1:.+]] = arith.constant 1 : index -// STORE-ND-DAG: %[[C2:.+]] = arith.constant 2 : index -// STORE-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// STORE-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// STORE-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// STORE-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// STORE-ND-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// STORE-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_dynamic_source( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -126,9 +121,9 @@ gpu.func @store_out_of_bounds(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<7x64xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// STORE-ND-SAME: %[[SRC]] // STORE-ND-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_out_of_bounds( // STORE-SCATTER: vector.transfer_write @@ -298,13 +293,13 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf16>, // STORE-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index -// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] -// STORE-ND-SAME: : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// 
STORE-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
+// STORE-ND-SAME: %[[COLLAPSED]]
+// STORE-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
 // STORE-ND-SAME: boundary_check = false
-// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf16>
+// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16>

 // STORE-SCATTER-LABEL: @store_to_subview(
 // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf16>,

From 8a84b285f67cb778493e225dc9699d902921e7b0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Tue, 4 Nov 2025 20:55:33 +0800
Subject: [PATCH 160/313] [SimplifyCFG] Eliminate dead edges of switches
 according to the domain of conditions (#165748)

In simplifycfg/cvp/sccp, we eliminate dead edges of switches according
to the knownbits/range info of conditions. However, these approximations
may not meet real-world needs when the domain of condition values is
sparse. For example, if the condition can only be either -3 or 3, we
cannot prove that the condition never evaluates to 1 (knownbits:
???????1, range: [-3, 4)).

This patch adds a helper function `collectPossibleValues` to enumerate
all the possible values of V. To fix the motivating issue,
`eliminateDeadSwitchCases` now uses the result to remove dead edges.

Note: In
https://discourse.llvm.org/t/missed-optimization-due-to-overflow-check/88700
I proposed a new value lattice kind to represent such values, but I
found it hard to apply because the transition becomes much more
complicated.

Compile-time impact looks neutral:
https://llvm-compile-time-tracker.com/compare.php?from=32d6b2139a6c8f79e074e8c6cfe0cc9e79c4c0c8&to=e47c26e3f1bf9eb062684dda4fafce58438e994b&stat=instructions:u

This patch removes a lot of dead error-handling code:
https://github.com/dtcxzyw/llvm-opt-benchmark/pull/3012

Closes https://github.com/llvm/llvm-project/issues/165179.
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  10 ++
 llvm/lib/Analysis/ValueTracking.cpp           |  52 ++++++++
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  67 ++++++----
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |   4 +-
 llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll   |   6 +-
 ...-on-const-select.ll => switch-on-const.ll} | 126 ++++++++++++++++++
 .../Transforms/SimplifyCFG/switch_mask.ll     |   1 +
 .../Transforms/SimplifyCFG/switch_undef.ll    |   7 +-
 8 files changed, 237 insertions(+), 36 deletions(-)
 rename llvm/test/Transforms/SimplifyCFG/{switch-on-const-select.ll => switch-on-const.ll} (54%)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index af218ba564081..093309cb8bbee 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume,
 LLVM_ABI Value *stripNullTest(Value *V);
 LLVM_ABI const Value *stripNullTest(const Value *V);
 
+/// Enumerates all possible values of V and inserts them into the set \p
+/// Constants. If \p AllowUndefOrPoison is false, it fails when V may contain
+/// undef/poison elements. Returns true if the result is complete. Otherwise,
+/// the result is incomplete (more than MaxCount values).
+/// NOTE: The constant values are not distinct.
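+/// Illustrative example (restating the motivating case from the commit
+/// message above): for
+///   %v = select i1 %c, i32 -3, i32 3
+/// the enumeration yields {-3, 3}, which lets a caller prove that a switch
+/// case on 1 is dead even though knownbits (???????1) and the constant
+/// range [-3, 4) cannot rule it out.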
+LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 523374bdc472f..789a98366cead 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -10341,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. 
+ if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index bb733277e0fad..532511dcf91b0 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6020,6 +6020,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6041,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6063,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. 
- if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. 
-define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret 
void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i From f771f1ee2697ee4c4d86de060f6ec8c7b4041b84 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 13:11:52 +0000 Subject: [PATCH 161/313] [llvm-dwarfdump][CMake] Link against BinaryFormat (#166364) In https://github.com/llvm/llvm-project/pull/165720 we started using a DWARF API (`llvm::dwarf::getTag`) from `BinaryFormat`. This patch makes dwarfdump link against the necessary LLVM component. 
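For reference, the dependency comes from a call of roughly this shape (a
minimal sketch: the wrapper function here is hypothetical, and the `unsigned`
return type and `DW_TAG_invalid` sentinel are assumptions based on the
`llvm/BinaryFormat/Dwarf.h` header):

```
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"

// Maps a tag name such as "DW_TAG_compile_unit" to its numeric DW_TAG value.
// The definition lives in libLLVMBinaryFormat, so any user must link that
// component -- hence the LLVM_LINK_COMPONENTS change below.
static unsigned lookupTag(llvm::StringRef Name) {
  unsigned Tag = llvm::dwarf::getTag(Name);
  return Tag; // llvm::dwarf::DW_TAG_invalid when Name is not a known tag.
}
```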
This fixes the following linker error that started occurring on some of
the bots:
```
[7758/8172] Linking CXX executable bin/llvm-dwarfdump
FAILED: bin/llvm-dwarfdump
: && /usr/bin/c++ -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wimplicit-fallthrough -Wno-uninitialized -Wno-nonnull -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-array-bounds -Wno-stringop-overread -Wno-noexcept-type -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wno-misleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -O3 -DNDEBUG -Wl,-rpath-link,/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/./lib -Wl,--gc-sections tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/SectionSizes.cpp.o tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/Statistics.cpp.o tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/llvm-dwarfdump.cpp.o -o bin/llvm-dwarfdump -Wl,-rpath,"\$ORIGIN/../lib:/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/lib:" lib/libLLVMAMDGPUDesc.so.22.0git lib/libLLVMSPIRVDesc.so.22.0git lib/libLLVMX86Desc.so.22.0git lib/libLLVMAMDGPUInfo.so.22.0git lib/libLLVMSPIRVInfo.so.22.0git lib/libLLVMX86Info.so.22.0git lib/libLLVMDebugInfoDWARF.so.22.0git lib/libLLVMObject.so.22.0git lib/libLLVMMC.so.22.0git lib/libLLVMDebugInfoDWARFLowLevel.so.22.0git lib/libLLVMTargetParser.so.22.0git lib/libLLVMSupport.so.22.0git -Wl,-rpath-link,/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/lib && :
/usr/bin/ld: tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/llvm-dwarfdump.cpp.o: undefined reference to symbol '_ZN4llvm5dwarf6getTagENS_9StringRefE'
/usr/bin/ld: /home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/./lib/libLLVMBinaryFormat.so.22.0git: error adding symbols: DSO missing from command line
collect2: error: ld returned 1 exit status
```
---
 llvm/tools/llvm-dwarfdump/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt
index aeb1b8f14d830..7a0adf32e938c 100644
--- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt
+++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  BinaryFormat
   DebugInfoDWARF
   DebugInfoDWARFLowLevel
   AllTargetsDescs

From 2e89b71906307d7394be503e07939a42d4449d51 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 4 Nov 2025 08:17:50 -0500
Subject: [PATCH 162/313] [C23] Correctly handle missing embed with -MG
 (#166188)

-MG is supposed to suppress "file not found" diagnostics and instead
treat those as generated files for the purposes of dependency scanning.
Clang was previously emitting the diagnostic instead of emitting the
name of the embedded file.

Fixes #165632
---
 clang/docs/ReleaseNotes.rst           |  2 ++
 clang/lib/Frontend/DependencyFile.cpp | 11 +++++++++++
 clang/lib/Lex/PPDirectives.cpp        |  2 +-
 clang/test/Driver/mg.c                |  4 +++-
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2e2c5198fa8f5..3f57ddc92d5e8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -209,6 +209,8 @@ C23 Feature Support
   `WG14 N2710 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2710.htm>`_.
- Fixed accepting as compatible unnamed tag types with the same fields within the same translation unit but from different types. +- ``-MG`` now silences the "file not found" errors with ``#embed`` when + scanning for dependencies and encountering an unknown file. #GH165632 Non-comprehensive list of changes in this release ------------------------------------------------- diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 15fa7de35df97..93e012b163878 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks { /*IsMissing*/ false); } + bool EmbedFileNotFound(StringRef FileName) override { + DepCollector.maybeAddDependency( + llvm::sys::path::remove_leading_dotslash(FileName), + /*FromModule=*/false, + /*IsSystem=*/false, + /*IsModuleFile=*/false, + /*IsMissing=*/true); + // Return true to silence the file not found diagnostic. + return true; + } + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6a5e5d4bad3a6..891c8ab7f3155 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok, this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile); if (!MaybeFileRef) { // could not find file - if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) { + if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) { return; } Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c index 82d8a6084e5e0..b7458a08698d3 100644 --- a/clang/test/Driver/mg.c +++ b/clang/test/Driver/mg.c @@ -1,5 +1,7 @@ -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s // CHECK: nonexistent-preinclude.h // CHECK: nonexistent-ppinclude.h +// CHECK: nonexistent-embed #include "nonexistent-ppinclude.h" +#embed "nonexistent-embed" From 89c26170394824c3b636dad0b799256848179fb4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 13:20:26 +0000 Subject: [PATCH 163/313] [X86] bittest-big-integer.ll - add test showing multiple uses of the RMW store chain AND its stored value (#166366) --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 263 +++++++++++++++++++ 1 file changed, 263 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 040ae65a33251..bcb14fd25b975 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1083,6 +1083,269 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ret i32 %ret } +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $112, %esp +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $28, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 72(%esp,%eax), %edx +; X86-NEXT: movl 76(%esp,%eax), %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 68(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edx +; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl 64(%esp,%eax), %edi +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: notl %eax +; X86-NEXT: notl %edx +; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: notl %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 12(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 8(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 20(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl 16(%ecx), %edx +; X86-NEXT: andl 28(%ecx), %eax +; X86-NEXT: andl 24(%ecx), %ebx +; X86-NEXT: andl 4(%ecx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl (%ecx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, 24(%ecx) +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 16(%ecx) +; X86-NEXT: movl %edi, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: orl %edi, %eax +; 
X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: jne .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: chain_reset_i256: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb $3, %al +; SSE-NEXT: andb $24, %al +; SSE-NEXT: negb %al +; SSE-NEXT: movsbq %al, %r10 +; SSE-NEXT: movq -24(%rsp,%r10), %r8 +; SSE-NEXT: movq -16(%rsp,%r10), %rax +; SSE-NEXT: shldq %cl, %r8, %rax +; SSE-NEXT: movq -32(%rsp,%r10), %r9 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -40(%rsp,%r10), %r10 +; SSE-NEXT: shldq %cl, %r10, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %r10 +; SSE-NEXT: notq %r8 +; SSE-NEXT: notq %rax +; SSE-NEXT: notq %r10 +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: andq 16(%rdi), %r8 +; SSE-NEXT: andq 8(%rdi), %r9 +; SSE-NEXT: andq (%rdi), %r10 +; SSE-NEXT: movq %r8, 16(%rdi) +; SSE-NEXT: movq %rax, 24(%rdi) +; SSE-NEXT: movq %r10, (%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: orq %rax, %r9 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %r10d, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: retq +; +; AVX2-LABEL: chain_reset_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrb $3, %al +; AVX2-NEXT: andb $24, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: movsbq %al, %rax +; AVX2-NEXT: movq -32(%rsp,%rax), %r8 +; AVX2-NEXT: movq -24(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %r10 +; AVX2-NEXT: shldq %cl, %r8, %r10 +; AVX2-NEXT: movq -40(%rsp,%rax), %r11 +; AVX2-NEXT: movq -16(%rsp,%rax), %rax +; AVX2-NEXT: shldq %cl, %r9, %rax +; AVX2-NEXT: shldq %cl, %r11, %r8 +; AVX2-NEXT: andnq 24(%rdi), %rax, %rax +; AVX2-NEXT: andnq 16(%rdi), %r10, %r9 +; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 +; AVX2-NEXT: shlxq %rcx, %r11, %rcx +; AVX2-NEXT: andnq (%rdi), %rcx, %rcx +; AVX2-NEXT: movq %r9, 16(%rdi) +; AVX2-NEXT: movq %rax, 24(%rdi) +; AVX2-NEXT: movq %rcx, (%rdi) +; AVX2-NEXT: movq %r8, 8(%rdi) +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: orq %rcx, %r9 +; AVX2-NEXT: movl (%rsi), %eax +; AVX2-NEXT: movl %ecx, (%rsi) +; AVX2-NEXT: movl (%rdx), %ecx +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: orq %r8, %r9 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: chain_reset_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX512-NEXT:    movl %ecx, %eax
+; AVX512-NEXT:    shrb $3, %al
+; AVX512-NEXT:    andb $24, %al
+; AVX512-NEXT:    negb %al
+; AVX512-NEXT:    movsbq %al, %rax
+; AVX512-NEXT:    movq -40(%rsp,%rax), %r8
+; AVX512-NEXT:    movq -32(%rsp,%rax), %r9
+; AVX512-NEXT:    movq -24(%rsp,%rax), %r10
+; AVX512-NEXT:    movq %r10, %r11
+; AVX512-NEXT:    shldq %cl, %r9, %r11
+; AVX512-NEXT:    movq -16(%rsp,%rax), %rax
+; AVX512-NEXT:    shldq %cl, %r10, %rax
+; AVX512-NEXT:    shlxq %rcx, %r8, %r10
+; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT:    shldq %cl, %r8, %r9
+; AVX512-NEXT:    andnq 24(%rdi), %rax, %rax
+; AVX512-NEXT:    andnq 16(%rdi), %r11, %rcx
+; AVX512-NEXT:    andnq 8(%rdi), %r9, %r8
+; AVX512-NEXT:    andnq (%rdi), %r10, %r9
+; AVX512-NEXT:    movq %rcx, 16(%rdi)
+; AVX512-NEXT:    movq %rax, 24(%rdi)
+; AVX512-NEXT:    movq %r9, (%rdi)
+; AVX512-NEXT:    movq %r8, 8(%rdi)
+; AVX512-NEXT:    orq %rax, %r8
+; AVX512-NEXT:    orq %r9, %rcx
+; AVX512-NEXT:    movl (%rsi), %eax
+; AVX512-NEXT:    movl %r9d, (%rsi)
+; AVX512-NEXT:    movl (%rdx), %edx
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    orq %r8, %rcx
+; AVX512-NEXT:    cmovnel %edx, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %rem = and i32 %position, 255
+  %ofs = zext nneg i32 %rem to i256
+  %bit = shl nuw i256 1, %ofs
+  %ld0 = load i256, ptr %p0
+  %msk = xor i256 %bit, -1
+  %res = and i256 %ld0, %msk
+  store i256 %res, ptr %p0
+  %cmp = icmp ne i256 %res, 0
+  %ld1 = load i32, ptr %p1
+  %trunc = trunc i256 %res to i32
+  store i32 %trunc, ptr %p1
+  %ld2 = load i32, ptr %p2
+  %add = add i32 %ld1, %ld2
+  %sel = select i1 %cmp, i32 %ld2, i32 %add
+  ret i32 %sel
+}
+
 ; BTC/BT/BTS sequence on same i128
 define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; X86-LABEL: sequence_i128:

From f771f1ee2697ee4c4d86de060f6ec8c7b4041b84 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 4 Nov 2025 13:39:20 +0000
Subject: [PATCH 164/313] [AArch64][SME] Support `aarch64-split-sve-objects`
 with VLAs/realignment (#163816)

This was left out of the original patch (#142392) to simplify the
initial implementation. However, after refactoring the SVE
prologue/epilogue code in #162253, it's not much of an extension to
support this case.

The main change here is that, when restoring the SP from the FP for the
SVE restores, we may need an additional frame offset to move from the
start of the ZPR callee-saves to the start of the PPR callee-saves.

This patch also fixes a previously latent bug where we'd add the
`RealignmentPadding` when allocating the PPR locals, then again for the
ZPR locals. This was unnecessary as the stack only needs to be
realigned after all SVE allocations.
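In rough C++ terms, the new restore sequence looks like this (a simplified
sketch of the epilogue change in this diff, with CFI updates and the
non-split paths omitted; the names are those used in
AArch64PrologueEpilogue.cpp):

```
// With split SVE objects, the ZPR saves sit below the hazard padding while
// the PPR saves sit directly below the frame record, so the epilogue now
// adjusts the SP in two steps relative to the frame pointer.

// Step 1: SP -> start of the ZPR callee-save area. The fixed part of the
// offset is subtracted via a temporary register first (see moveSPBelowFP).
StackOffset FPOffsetZPR =
    -SVECalleeSavesSize - PPR.LocalsSize -
    StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR);

// Step 2 (split layout only): after the ZPR reloads, SP -> start of the PPR
// callee-save area, a purely scalable offset below the FP.
if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split)
  emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP,
                  -PPR.CalleeSavesSize, TII, MachineInstr::FrameDestroy);
```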
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |    8 +-
 .../AArch64/AArch64PrologueEpilogue.cpp       |   63 +-
 .../Target/AArch64/AArch64PrologueEpilogue.h  |    4 +
 .../CodeGen/AArch64/framelayout-split-sve.mir |   49 +-
 .../AArch64/split-sve-stack-frame-layout.ll   |  338 ++-
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 1828 ++++++++++-------
 6 files changed, 1491 insertions(+), 799 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0f7b34c36055f..3ee4d58ca892c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
     return;
   }
 
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
-    LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
-                         "sized objects or realignment\n");
-    return;
-  }
-
   // If another calling convention is explicitly set FPRs can't be promoted to
   // ZPR callee-saves.
   if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
   assert(Subtarget.isSVEorStreamingSVEAvailable() &&
          "Expected SVE to be available for PPRs");
 
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   // With SplitSVEObjects the CS hazard padding is placed between the
   // PPRs and ZPRs. If there are any FPR CS there would be a hazard between
   // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 45b7120112af2..4df4d54e60c95 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() {
     CFAOffset += SVEAllocs.BeforePPRs;
     assert(PPRRange.End == ZPRRange.Begin &&
            "Expected ZPR callee saves after PPR locals");
-    allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs,
+    allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs,
                        EmitAsyncCFI && !HasFP, CFAOffset,
                        MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs);
     CFAOffset += SVEAllocs.AfterPPRs;
@@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
   SEHEpilogueStartI = MBB.end();
 }
 
+void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
+                                           StackOffset Offset) {
+  // Other combinations could be supported, but are not currently needed.
+  assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
+         "expected negative offset (with optional fixed portion)");
+  Register Base = AArch64::FP;
+  if (int64_t FixedOffset = Offset.getFixed()) {
+    // If we have a negative fixed offset, we need to subtract it in a
+    // temporary register first (to avoid briefly deallocating the scalable
+    // portion of the offset).
+ Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); + } + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. 
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { auto NonSVELocals = StackOffset::getFixed(NumBytes); auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 6e0e28324a0ac..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -180,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index f535e0fe8b387..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -162,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = 
frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index c13dd33865c37..f65aec6665cec 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -737,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -805,3 +792,316 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. 
+define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x8, #-4
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+; Only PPR callee-saves (and ZPR/PPR locals) + a VLA
+; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with
+; hazard padding after the PPR callee saves (1024) and after the FPR local area
+; (1024) -- coalesced to 2048. Only PPRs restored by moving the SP to
+; `FP + addvl #-1`.
+define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) {
+; CHECK-LABEL: sve_locals_only_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..70874761b82ab 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3528,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,30 +3541,32 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; 
CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov w2, w1 @@ -3595,22 +3596,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: 
ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3633,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 
@@ -3649,305 +3650,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; 
CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; CHECK0-NEXT: .cfi_restore z9 -; CHECK0-NEXT: .cfi_restore z10 -; CHECK0-NEXT: .cfi_restore z11 -; CHECK0-NEXT: .cfi_restore z12 -; CHECK0-NEXT: .cfi_restore z13 -; CHECK0-NEXT: .cfi_restore z14 -; CHECK0-NEXT: .cfi_restore z15 -; CHECK0-NEXT: mov sp, x29 -; CHECK0-NEXT: .cfi_def_cfa wsp, 64 -; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK0-NEXT: .cfi_def_cfa_offset 0 -; CHECK0-NEXT: .cfi_restore w19 -; CHECK0-NEXT: .cfi_restore w26 -; CHECK0-NEXT: .cfi_restore w27 -; CHECK0-NEXT: .cfi_restore w28 -; CHECK0-NEXT: .cfi_restore vg -; CHECK0-NEXT: .cfi_restore w30 -; CHECK0-NEXT: .cfi_restore w29 -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: 
svecc_call_realign: -; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 -; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; 
CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: 
.cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: mov w8, w0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov w8, w8 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: mov x20, x0 +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x8 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded 
Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret +entry: + %ptr = alloca i8, i32 %P1 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_realign: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w26, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 +; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 +; CHECK0-NEXT: sub x9, sp, #1024 +; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: tbz w19, #0, .LBB36_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB36_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB36_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB36_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: 
svecc_call_realign: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3960,30 +4100,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 
0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub x9, sp, #1088 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK64-NEXT: mov w2, w1 @@ -4006,22 +4148,31 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, 
[sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4034,20 +4185,11 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w26 @@ -4058,140 +4200,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_realign: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: 
cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // 
$d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub x9, sp, #2048 -; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB36_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB36_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB36_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB36_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: 
.cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str 
z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 
@ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; 
CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: %ptr = alloca i8, i32 1000, align 32 tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 @@ -4311,13 +4583,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -4330,41 +4601,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w20, -24 -; CHECK64-NEXT: .cfi_offset w26, -32 -; CHECK64-NEXT: .cfi_offset w27, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 ; CHECK64-NEXT: .cfi_offset w28, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: 
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4658,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4687,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> From 389aa46a99405c5c003c309ce1b0defbdebf6b23 Mon Sep 17 00:00:00 2001 From: Fabian Mora <fmora.dev@gmail.com> Date: Tue, 4 Nov 2025 09:12:35 -0500 Subject: [PATCH 165/313] [mlir][DataFlow] Add visitBlockTransfer hook to dense analyses (#166263) Add a customizable `visitBlockTransfer` method to dense forward and backward dataflow analyses, allowing subclasses to customize lattice propagation behavior along control flow edges between blocks. The default implementation preserves the existing join/meet semantics. This change mirrors the existing structure of both dense dataflow classes, where propagation across `RegionBranchOpInterface` ops and callables can already be customized by subclasses. The use case motivating this change is dense liveness analysis. Currently, without the customization hook, the block transfer function produces incorrect results. The issue is that the current logic doesn't remove the successor block arguments from the live set, as it only meets the successor state with the predecessor state (i.e., set union). With this change it is now possible to compute the correct result by specifying the correct logic in `visitBlockTransfer`. Signed-off-by: Fabian Mora <fabian.mora-cordero@amd.com> --- .../mlir/Analysis/DataFlow/DenseAnalysis.h | 69 +++++++++++++++++++ mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp | 9 ++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 3c87c453a4cf0..5b7b45fdd1d58 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -127,6 +127,18 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class.
virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice forward along the control + /// flow edge from predecessor to block. `point` corresponds to the program + /// point before `block`. The default implementation merges in the state from + /// the predecessor's terminator. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) { + // Merge in the state from the predecessor's terminator. + join(after, before); + } + /// Propagate the dense lattice forward along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -259,6 +271,22 @@ class DenseForwardDataFlowAnalysis branch, regionFrom, regionTo, before, after); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `predecessor` to + /// `block`. The lattice is propagated forward along this edge. The lattices + /// are as follows: + /// - `before` is the lattice at the end of the predecessor block; + /// - `after` is the lattice at the beginning of the block. + /// By default, the `after` state is simply joined with the `before` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, const LatticeT &before, + LatticeT *after) { + AbstractDenseForwardDataFlowAnalysis::visitBlockTransfer( + block, point, predecessor, before, after); + } + protected: /// Get the dense lattice on this lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -306,6 +334,13 @@ class DenseForwardDataFlowAnalysis static_cast<const LatticeT &>(before), static_cast<LatticeT *>(after)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) final { + visitBlockTransfer(block, point, predecessor, + static_cast<const LatticeT &>(before), + static_cast<LatticeT *>(after)); + } }; //===----------------------------------------------------------------------===// @@ -388,6 +423,17 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice backward along the control + /// flow edge from successor to block. `point` corresponds to the program + /// point after `block`. The default implementation merges in the state from + /// the successor's first operation or the block itself when empty. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) { + meet(before, after); + } + /// Propagate the dense lattice backwards along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -531,6 +577,22 @@ class DenseBackwardDataFlowAnalysis branch, regionFrom, regionTo, after, before); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. 
The control flows from `successor` to + /// `block`. The lattice is propagated back along this edge. The lattices + /// are as follows: + /// - `after` is the lattice at the beginning of the successor block; + /// - `before` is the lattice at the end of the block. + /// By default, the `before` state is simply met with the `after` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, const LatticeT &after, + LatticeT *before) { + AbstractDenseBackwardDataFlowAnalysis::visitBlockTransfer( + block, point, successor, after, before); + } + protected: /// Get the dense lattice at the given lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -577,6 +639,13 @@ class DenseBackwardDataFlowAnalysis static_cast<const LatticeT &>(after), static_cast<LatticeT *>(before)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) final { + visitBlockTransfer(block, point, successor, + static_cast<const LatticeT &>(after), + static_cast<LatticeT *>(before)); + } }; } // end namespace dataflow diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 0682e5f26785a..22bc0b32a9bd1 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -266,9 +266,10 @@ void AbstractDenseForwardDataFlowAnalysis::visitBlock(Block *block) { } LDBG() << " Joining state from predecessor " << predecessor; + const AbstractDenseLattice &before = *getLatticeFor( + point, getProgramPointAfter(predecessor->getTerminator())); // Merge in the state from the predecessor's terminator. - join(after, *getLatticeFor( - point, getProgramPointAfter(predecessor->getTerminator()))); + visitBlockTransfer(block, point, predecessor, before, after); } } @@ -614,7 +615,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { LDBG() << " Meeting state from successor " << successor; // Merge in the state from the successor: either the first operation, or the // block itself when empty. - meet(before, *getLatticeFor(point, getProgramPointBefore(successor))); + visitBlockTransfer(block, point, successor, + *getLatticeFor(point, getProgramPointBefore(successor)), + before); } } From ed45c0571eb35339f7c3562edbb3b27d67594acd Mon Sep 17 00:00:00 2001 From: Tuomas Kärnä <tuomas.karna@intel.com> Date: Tue, 4 Nov 2025 16:22:00 +0200 Subject: [PATCH 166/313] [MLIR][XeGPU] fix load/store/prefetch op offset verifier (#166137) The verifier of the `xegpu.{load/store/prefetch}_nd` ops fails if `offset` is a mix of static and dynamic values, e.g. `offset = [0, %c0]`. In this case the length of the dynamic offsets is 1, so the check `offsetSize != tDescRank` (= 2) fails. Instead, we should check the length of `getMixedOffsets()`.
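To see why counting only the dynamic operands undercounts, here is a minimal, self-contained C++ sketch of the corrected rank check; the helper name `offsetsRankValid` is invented for illustration, and only `llvm::ArrayRef` and `mlir::OpFoldResult` are assumed from the upstream headers:

#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/OpDefinition.h" // declares mlir::OpFoldResult

// getMixedOffsets() folds the static (attribute) and dynamic (SSA value)
// offsets into a single OpFoldResult list, so a mixed list such as
// [0, %c0] has size 2, whereas counting only the dynamic operands (the
// old check) reports 1 and spuriously mismatches a rank-2 descriptor.
static bool offsetsRankValid(llvm::ArrayRef<mlir::OpFoldResult> mixedOffsets,
                             int64_t tDescRank) {
  int64_t offsetSize = static_cast<int64_t>(mixedOffsets.size());
  // Offsets are optional; when present, their rank must match the rank
  // of the tensor descriptor.
  return offsetSize == 0 || offsetSize == tDescRank;
}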
--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 21 ++++++--------------- mlir/test/Dialect/XeGPU/ops.mlir | 9 +++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index c8f5c86c03686..fb51077b5dff3 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -498,11 +498,8 @@ LogicalResult PrefetchNdOp::verify() { return emitOpError("invalid l3_hint: ") << getL3HintAttr(); int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -623,11 +620,8 @@ LogicalResult LoadNdOp::verify() { << tdescTy; int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -717,11 +711,8 @@ LogicalResult StoreNdOp::verify() { << dstTy; int64_t tDescRank = dstTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? 
getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 0a10f6814ae96..9b3829664108d 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -278,6 +278,15 @@ gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : in gpu.return } +// CHECK: func @subgroup_load_nd_offset_2(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index) { +gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[%x, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + gpu.return +} + // CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> From a01e4da6d64320bf308ef1596ec4d630772616c9 Mon Sep 17 00:00:00 2001 From: Lei Huang <lei@ca.ibm.com> Date: Tue, 4 Nov 2025 09:29:26 -0500 Subject: [PATCH 167/313] [PowerPC] Ensure correct codegen for MMA functions for cpu=future (#165791) Update MMA tests to add run lines for `cpu=future` to ensure MMA functionality is not broken with the new `wacc` register classes introduced. A previous commit added defs for using the new `wacc` registers; this just adds testing and fixes a few patterns that were missing.
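To make the fixed pattern concrete, here is a short, hedged C-level sketch: the clang builtin `__builtin_mma_xvf64gernp` lowers to the `llvm.ppc.mma.xvf64gernp` intrinsic whose ISA-future pattern the `.td` hunk below corrects (selecting the `wacc`-based XVF64GERWNP instead of XVF64GERNP). The function name is invented, and compiling it assumes a clang with MMA enabled and the experimental `future` CPU:

#include <altivec.h>

// xvf64gernp performs one f64 GER (outer-product) step with a negative
// multiply and positive accumulate, folding the product of the f64 data
// in `pair` and the data in `vc` into the 512-bit accumulator *acc.
void xvf64gernp_step(__vector_quad *acc, __vector_pair pair,
                     vector unsigned char vc) {
  __builtin_mma_xvf64gernp(acc, pair, vc);
}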
--- llvm/lib/Target/PowerPC/PPCInstrMMA.td | 4 +- .../CodeGen/PowerPC/mma-acc-copy-hints.ll | 91 ++ llvm/test/CodeGen/PowerPC/mma-acc-memops.ll | 170 +++ llvm/test/CodeGen/PowerPC/mma-acc-spill.ll | 102 ++ .../mma-integer-based-outer-product.ll | 166 +++ llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 517 +++++++ .../test/CodeGen/PowerPC/mma-outer-product.ll | 1266 +++++++++++++++++ llvm/test/CodeGen/PowerPC/mma-phi-accs.ll | 202 +++ .../PowerPC/peephole-mma-phi-liveness.ll | 39 + 9 files changed, 2555 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll index 7e2f744ac1d71..94121f09e36be 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 { ; CHECK-LABEL: testMultiply: @@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound ; CHECK-BE-NEXT: ld r30, -16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; 
CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs 
-mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: 
; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) 
{ ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ 
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; 
CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; 
+; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: 
xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 
wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call 
to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: 
# %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 
0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; 
CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: 
stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 
0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll 
b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: 
xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: 
stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 
32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; 
CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 
16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, 
i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; 
CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x 
i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv 
v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, 
align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; 
CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; 
CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # 
%entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; 
CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; 
CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll 
b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
index 291cf97fd009e..929bf5f61dd90 100644
--- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
+++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN:   -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC

 target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"

@@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    xxswapd 0, 0
 ; CHECK-NEXT:    stxv 0, 0(3)
 ; CHECK-NEXT:    blr
+;
+; CHECK-WACC-LABEL: baz:
+; CHECK-WACC:       # %bb.0: # %bb
+; CHECK-WACC-NEXT:    dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT:    xxmrgld 1, 34, 36
+; CHECK-WACC-NEXT:    xxswapd 2, 1
+; CHECK-WACC-NEXT:    xxlxor 0, 0, 0
+; CHECK-WACC-NEXT:    xvnegdp 1, 1
+; CHECK-WACC-NEXT:    xvnegdp 2, 2
+; CHECK-WACC-NEXT:    xvsubdp 1, 1, 0
+; CHECK-WACC-NEXT:    xvsubdp 2, 2, 37
+; CHECK-WACC-NEXT:    xvmuldp 1, 1, 0
+; CHECK-WACC-NEXT:    xvmuldp 2, 2, 0
+; CHECK-WACC-NEXT:    xvmaddadp 1, 0, 0
+; CHECK-WACC-NEXT:    xvmaddadp 2, 0, 0
+; CHECK-WACC-NEXT:    stxv 1, 0(3)
+; CHECK-WACC-NEXT:    stxv 2, 0(3)
+; CHECK-WACC-NEXT:    # implicit-def: $wacc0
+; CHECK-WACC-NEXT:    bc 12, 20, L..BB0_2
+; CHECK-WACC-NEXT:  # %bb.1: # %bb10
+; CHECK-WACC-NEXT:    xvf64gerpp 0, 34, 0
+; CHECK-WACC-NEXT:  L..BB0_2: # %bb12
+; CHECK-WACC-NEXT:    cmpdi 3, 0
+; CHECK-WACC-NEXT:    .align 4
+; CHECK-WACC-NEXT:  L..BB0_3: # %bb13
+; CHECK-WACC-NEXT:    #
+; CHECK-WACC-NEXT:    bc 4, 2, L..BB0_3
+; CHECK-WACC-NEXT:  # %bb.4: # %bb14
+; CHECK-WACC-NEXT:    dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT:    xxlxor 0, 0, 0
+; CHECK-WACC-NEXT:    xvsubdp 1, 0, 35
+; CHECK-WACC-NEXT:    xxlxor 2, 2, 2
+; CHECK-WACC-NEXT:    xvmaddadp 2, 1, 2
+; CHECK-WACC-NEXT:    xvadddp 0, 2, 0
+; CHECK-WACC-NEXT:    xxswapd 0, 0
+; CHECK-WACC-NEXT:    stxv 0, 0(3)
+; CHECK-WACC-NEXT:    blr
 bb:
   %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison)
   %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0

From 2237a18f25dc93b46d478c9c7da6a514362cb6e3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Tue, 4 Nov 2025 16:37:16 +0200
Subject: [PATCH 168/313] [SPIRV] Enable OpenCL max_work_group_size translation
 via `SPV_INTEL_kernel_attributes` (#165891)

This adds backend (BE) support for the
[`SPV_INTEL_kernel_attributes`](https://github.khronos.org/SPIRV-Registry/extensions/INTEL/SPV_INTEL_kernel_attributes.html)
extension, which is needed to encode the useful `max_work_group_size`
kernel attribute via `OpExecutionMode MaxWorkgroupSizeINTEL`. That is the
only Execution Mode added by the extension for which this patch adds full
processing; future patches will add the remaining Execution Modes and
Capabilities.

The test is adapted from the equivalent Translator test; it depends on
#165815.
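To make the encoding concrete, here is a minimal sketch of the mapping this
patch enables. It mirrors the `Dim2` kernel from the test added below, so the
metadata shape, the compile command, and the resulting execution mode come
from this patch rather than being invented:

```llvm
; A SPIR kernel annotated with the max_work_group_size metadata.
define spir_kernel void @Dim2() !max_work_group_size !1 {
  ret void
}
!1 = !{i32 8, i32 4}

; Compiled with:
;   llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes
; the backend emits (%N standing in for Dim2's entry-point id):
;   OpCapability KernelAttributesINTEL
;   OpExtension "SPV_INTEL_kernel_attributes"
;   OpExecutionMode %N MaxWorkgroupSizeINTEL 8 4 1
```

Note how the unspecified third dimension is padded with 1, matching the
`3, 1` arguments passed to `outputExecutionModeFromMDNode` in
`SPIRVAsmPrinter.cpp` (plausibly the operand count and the default value).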
--- llvm/docs/SPIRVUsage.rst | 6 ++-- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 5 +++ llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 4 ++- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 4 +++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 16 +++++++++- .../max_work_group_size.ll | 32 +++++++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 85eeabf10244a..99f56a5cbc63a 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -187,6 +187,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +228,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. 
* - ``SPV_KHR_float_controls2`` diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..0175f2fb3698b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..f0558ebcb6681 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -155,7 +155,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a55ee6c6..d154a06c6f313 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -2180,6 +2180,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..267118364c371 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -587,6 +587,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -805,6 +810,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. +// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1933,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: 
%[[DIM2]] = OpFunction
+; CHECK: %[[DIM3]] = OpFunction
+
+define spir_kernel void @Dim1() !max_work_group_size !0 {
+  ret void
+}
+
+define spir_kernel void @Dim2() !max_work_group_size !1 {
+  ret void
+}
+
+define spir_kernel void @Dim3() !max_work_group_size !2 {
+  ret void
+}
+
+!0 = !{i32 4}
+!1 = !{i32 8, i32 4}
+!2 = !{i32 16, i32 8, i32 4}

From bdf02486a3b148ef4302d6948fd8eb76bd037a04 Mon Sep 17 00:00:00 2001
From: Manuel Carrasco <Manuel.Carrasco@amd.com>
Date: Tue, 4 Nov 2025 14:38:14 +0000
Subject: [PATCH 169/313] [clang][Driver] Fix crash in --offload-new-driver
 and -save-temps. (#165606)

`clang -x hip foo.c --offload-arch=amdgcnspirv --offload-new-driver
-save-temps` was crashing with the following error:

```
/usr/bin/ld: input file 'foo-x86_64-unknown-linux-gnu.o' is the same as output file
build/bin/clang-linker-wrapper: error: 'ld' failed
```

The `LinkerWrapperJobAction` [is created](https://github.com/llvm/llvm-project/blob/957598f71bd8baa029d886e59ed9aed60e6e9bb9/clang/lib/Driver/Driver.cpp#L4888)
with `types::TY_Object`, which makes `Driver::GetNamedOutputPath` assign the
same name as the assembler's output, thus causing the crash.
---
 clang/lib/Driver/Driver.cpp                         | 13 ++++++++++---
 clang/test/Driver/hip-spirv-translator-new-driver.c |  9 +++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/hip-spirv-translator-new-driver.c

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 51618d17a4180..6f6a35b4c8c17 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6463,9 +6463,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
            (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC &&
             TC->getTriple().isAMDGPU()));
   };
-  if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC &&
-      (C.getArgs().hasArg(options::OPT_emit_llvm) ||
-       IsAMDRDCInCompilePhase(JA, C.getArgs())))
+
+  // The linker wrapper may not support the input and output files to be the
+  // same one, and without it -save-temps can fail.
+  bool IsLinkerWrapper =
+      JA.getType() == types::TY_Object && isa<LinkerWrapperJobAction>(JA);
+  bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC &&
+                       (C.getArgs().hasArg(options::OPT_emit_llvm) ||
+                        IsAMDRDCInCompilePhase(JA, C.getArgs()));
+
+  if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode))
     Suffixed += ".tmp";
   Suffixed += '.';
   Suffixed += Suffix;

diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
new file mode 100644
index 0000000000000..315a74635b9b3
--- /dev/null
+++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
@@ -0,0 +1,9 @@
+// The --offload-new-driver was crashing when using -save-temps due to a failure in clang-linker-wrapper.
+// The input and output files cannot be the same.
+ +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ +// RUN: --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: | FileCheck %s + +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}} From 718a3b268fcff1f985d9ee04a53fb1da4569416a Mon Sep 17 00:00:00 2001 From: YongKang Zhu <yongzhu@fb.com> Date: Tue, 4 Nov 2025 06:49:04 -0800 Subject: [PATCH 170/313] [BOLT][AArch64] Run LDR relaxation (#165787) Replace the current `ADRRelaxationPass` with `AArch64RelaxationPass`, which, besides the existing ADR relaxation, will also run LDR relaxation that for now only handles these two forms of LDR instructions: `ldr Xt, [label]` and `ldr Wt, [label]`. --- bolt/include/bolt/Core/MCPlusBuilder.h | 23 ++++ ...laxationPass.h => AArch64RelaxationPass.h} | 22 ++-- bolt/include/bolt/Passes/FixRelaxationPass.h | 2 +- ...tionPass.cpp => AArch64RelaxationPass.cpp} | 47 ++++--- bolt/lib/Passes/CMakeLists.txt | 2 +- bolt/lib/Rewrite/BinaryPassManager.cpp | 12 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 42 ++++++ bolt/test/AArch64/ldr-relaxation.s | 122 ++++++++++++++++++ 8 files changed, 232 insertions(+), 40 deletions(-) rename bolt/include/bolt/Passes/{ADRRelaxationPass.h => AArch64RelaxationPass.h} (51%) rename bolt/lib/Passes/{ADRRelaxationPass.cpp => AArch64RelaxationPass.cpp} (67%) create mode 100644 bolt/test/AArch64/ldr-relaxation.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..5e349cd69fb43 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -840,6 +840,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1789,6 +1799,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations. Such problems are usually connected with errata 843419 -// https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to small offset +// range of ADR and LDR instruction (+- 1MB) which could be easily overflowed +// after BOLT optimizations. Such problems are usually connected with errata +// 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt<bool> &PrintPass) + explicit AArch64RelaxationPass(const cl::opt<bool> &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp similarity index 67% rename from bolt/lib/Passes/ADRRelaxationPass.cpp rename to bolt/lib/Passes/AArch64RelaxationPass.cpp index c3954c94a7f92..610adad58cfcb 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===// +//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ADRRelaxationPass class. +// This file implements the AArch64RelaxationPass class. 
// //===----------------------------------------------------------------------===// -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Utils/CommandLineOpts.h" #include <iterator> @@ -20,10 +20,10 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltCategory; -static cl::opt<bool> - AdrPassOpt("adr-relaxation", - cl::desc("Replace ARM non-local ADR instructions with ADRP"), - cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); +static cl::opt<bool> AArch64PassOpt( + "aarch64-relaxation", + cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"), + cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); } // namespace opts namespace llvm { @@ -35,7 +35,7 @@ namespace bolt { // jobs and checking the exit flag after it. static bool PassFailed = false; -void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { +void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) { if (PassFailed) return; @@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { MCInst &Inst = *It; - if (!BC.MIB->isADR(Inst)) + bool IsADR = BC.MIB->isADR(Inst); + + // TODO: Handle other types of LDR (literal, PC-relative) instructions. + if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst)) continue; - const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst); + const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1); if (!Symbol) continue; @@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - // Don't relax ADR if it points to the same function and is in the main - // fragment and BF initial size is < 1MB. + // Don't relax ADR/LDR if it points to the same function and is in the + // main fragment and BF initial size is < 1MB. const unsigned OneMB = 0x100000; if (BF.getSize() < OneMB) { BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); if (TargetBF == &BF && !BB.isSplit()) continue; - // No relaxation needed if ADR references a basic block in the same + // No relaxation needed if ADR/LDR references a basic block in the same // fragment. if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol)) if (BB.getFragmentNum() == TargetBB->getFragmentNum()) continue; } - InstructionListType AdrpAdd; + InstructionListType AdrpMaterialization; { auto L = BC.scopeLock(); - AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()); + AdrpMaterialization = + IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()) + : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get()); } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { @@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. auto L = BC.scopeLock(); - BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function " - << BF << '\n'; + BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? 
"ADR" : "LDR") + << " in non-simple function " << BF << '\n'; PassFailed = true; return; } - It = BB.replaceInstruction(It, AdrpAdd); + It = BB.replaceInstruction(It, AdrpMaterialization); } } } -Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { - if (!opts::AdrPassOpt || !BC.HasRelocations) +Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) { + if (!opts::AArch64PassOpt || !BC.HasRelocations) return Error::success(); ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { @@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass"); + "AArch64RelaxationPass"); if (PassFailed) return createFatalBOLTError(""); diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index d7519518f186f..3197e62faad21 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -1,5 +1,5 @@ add_llvm_library(LLVMBOLTPasses - ADRRelaxationPass.cpp + AArch64RelaxationPass.cpp Aligner.cpp AllocCombiner.cpp AsmDump.cpp diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 782137e807662..1a0f6d75d63e8 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/BinaryPassManager.h" -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Passes/Aligner.h" #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" @@ -129,10 +129,10 @@ static cl::opt<bool> PrintJTFootprintReduction( cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt<bool> - PrintAdrRelaxation("print-adr-relaxation", - cl::desc("print functions after ADR Relaxation pass"), - cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt<bool> PrintAArch64Relaxation( + "print-adr-ldr-relaxation", + cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt<bool> PrintLongJmp("print-longjmp", @@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (BC.isAArch64()) { Manager.registerPass( - std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation)); + std::make_unique<AArch64RelaxationPass>(PrintAArch64Relaxation)); // Tighten branches according to offset differences between branch and // targets. 
No extra instructions after this pass, otherwise we may have diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 7769162d67eaf..8a496c566b06b 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { atomicAdd(Insts.back(), RegTo, RegTmp); return Insts; } + class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; @@ -583,6 +584,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +611,39 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + ELF::R_AARCH64_ADD_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. 
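+## (BOLT cannot insert new instructions into a non-simple function, so
+## relaxation has to overwrite a linker-inserted NOP next to the LDR; when no
+## such NOP exists, the pass reports an error, as checked below.)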
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. 
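+## The relaxed sequence computes the address in the 64-bit super-register
+## (adrp x0) while the load keeps its 32-bit destination (ldr w0).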
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN:   --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.so -Wl,-q
+# RUN: llvm-bolt %t.so -o %t.bolt
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW
+
+# RELAXW: adrp x0
+# RELAXW-NEXT: ldr w0
+
+.ifdef RELAX_SIMPLE_WREG
+  .text
+  .global _start
+  .type _start, %function
+_start:
+  .cfi_startproc
+  ldr w0, _foo
+  ret
+  .cfi_endproc
+.size _start, .-_start
+.endif
+
+  .section .text_cold
+  .global _foo
+  .align 3
+_foo:
+  .long 0x12345678
+.size _foo, .-_foo

From b25868157df732208ced87e1e4703311f6011e61 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Tue, 4 Nov 2025 09:53:45 -0500
Subject: [PATCH 171/313] [NFC] add LLVM_ABI to function getMemcmp declaration (#166192)

Per the discussion at
https://github.com/llvm/llvm-project/pull/153600#discussion_r2356071934,
add LLVM_ABI to the getMemcmp function declaration.

---
 llvm/include/llvm/CodeGen/SelectionDAG.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 1a5ffb38f2568..0dd4f23c6d85f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1260,9 +1260,15 @@ class SelectionDAG {
   /// stack arguments from being clobbered.
   LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain);
 
-  std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl,
-                                        SDValue Dst, SDValue Src, SDValue Size,
-                                        const CallInst *CI);
+  /// Lower a memcmp operation into a target library call and return the
+  /// resulting chain and call result as SelectionDAG SDValues.
+  LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl,
+                                                 SDValue Dst, SDValue Src,
+                                                 SDValue Size,
+                                                 const CallInst *CI);
+
+  /// Lower a strlen operation into a target library call and return the
+  /// resulting chain and call result as SelectionDAG SDValues.
   LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl,
                                                  SDValue Src, const CallInst *CI);

From 3170345e20b77a917a61d02119986a53cbe08486 Mon Sep 17 00:00:00 2001
From: Tomer Shafir <tomer.shafir8@gmail.com>
Date: Tue, 4 Nov 2025 17:05:58 +0200
Subject: [PATCH 172/313] [X86] Fix LEA compression on 64-bit (#166334)

NDD ADD is only supported on 64-bit, but `LEA32` has
`Requires<[Not64BitMode]>`. The reason it doesn't fail upstream is that
the predicate check is commented out in `X86MCInstLower.cpp`:

```
  // FIXME: Enable feature predicate checks once all the test pass.
  // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
  //                                     Subtarget->getFeatureBits());
```

Introduced by: https://github.com/llvm/llvm-project/pull/158254

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c0c7f5adf06ef..ddbd10d8f7eda 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
     const MachineOperand &Src2 = MI.getOperand(2);
     bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
     const MCInstrDesc &NewDesc =
-        ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r);
+        ST.getInstrInfo()->get(Is32BitReg ?
X86::LEA64_32r : X86::LEA64r);
     if (Is32BitReg)
       Src1 = getX86SubSuperRegister(Src1, 64);
     MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst)

From 5ba746d9a0f7850abbbd5d6f7c5fabcfe0a22e45 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 4 Nov 2025 07:10:53 -0800
Subject: [PATCH 173/313] [NFC][TableGen] Use namespace qualifier to define `RecordKeeperImpl` (#166220)

---
 llvm/lib/TableGen/Record.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index afce803f3f568..8ad20b45f5e16 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -46,12 +46,11 @@ using namespace llvm;
 // Context
 //===----------------------------------------------------------------------===//
 
-namespace llvm::detail {
 /// This class represents the internal implementation of the RecordKeeper.
 /// It contains all of the contextual static state of the Record classes. It is
 /// kept out-of-line to simplify dependencies, and also make it easier for
 /// internal classes to access the uniquer state of the keeper.
-struct RecordKeeperImpl {
+struct detail::RecordKeeperImpl {
   RecordKeeperImpl(RecordKeeper &RK)
       : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK),
         SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK),
@@ -99,7 +98,6 @@ struct RecordKeeperImpl {
 
   void dumpAllocationStats(raw_ostream &OS) const;
 };
-} // namespace llvm::detail
 
 void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const {
   // Dump memory allocation related stats.

From a2495ff991bdad98abca022e89eb93d5fb13a915 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 4 Nov 2025 07:11:26 -0800
Subject: [PATCH 174/313] [NFC][TableGen] Emit empty lines after/before namespace scope (#166217)

Emit an empty line after a namespace scope is opened and before it is
closed. Adjust the DirectiveEmitter empty-line emission in response, to
avoid a lot of unit test changes.

---
 llvm/include/llvm/TableGen/CodeGenHelpers.h   |  5 ++--
 llvm/test/TableGen/directive1.td              |  2 ++
 llvm/test/TableGen/directive2.td              |  3 +++
 .../utils/TableGen/Basic/DirectiveEmitter.cpp | 25 ++++++-------------
 mlir/test/mlir-tblgen/cpp-class-comments.td   |  4 +++
 5 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h
index e22c6d4f6d390..95866e306b5ff 100644
--- a/llvm/include/llvm/TableGen/CodeGenHelpers.h
+++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h
@@ -20,6 +20,7 @@
 #include <string>
 
 namespace llvm {
+
 // Simple RAII helper for emitting ifdef-undef-endif scope.
 class IfDefEmitter {
 public:
@@ -57,7 +58,7 @@ class NamespaceEmitter {
   NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed)
       : Name(trim(NameUntrimmed).str()), OS(OS) {
     if (!Name.empty())
-      OS << "namespace " << Name << " {\n";
+      OS << "namespace " << Name << " {\n\n";
   }
 
   ~NamespaceEmitter() { close(); }
@@ -65,7 +66,7 @@ class NamespaceEmitter {
   // Explicit function to close the namespace scopes.
void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 475faf9254157..8648651f3d714 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index ccc09446b4465..96022d7647440 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { @@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); +// CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl // CHECK-EMPTY: // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { @@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 3c6ff1132230b..d33bf45595e2e 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -81,6 +81,7 @@ static void generateEnumExports(ArrayRef<const Record *> Records, std::string N = getIdentifierName(R, Prefix); OS << "constexpr auto " << N << " = " << Enum << "::" << N << ";\n"; } + OS << "\n"; } // Generate enum class. Entries are emitted in the order in which they appear @@ -88,7 +89,6 @@ static void generateEnumExports(ArrayRef<const Record *> Records, static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, StringRef Enum, StringRef Prefix, bool ExportEnums) { - OS << "\n"; OS << "enum class " << Enum << " {\n"; if (!Records.empty()) { std::string N; @@ -104,17 +104,15 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. 
// At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enum class with values corresponding to different bit positions. @@ -127,7 +125,6 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, StringRef Type = Records.size() <= 32 ? "uint32_t" : "uint64_t"; StringRef TypeSuffix = Records.size() <= 32 ? "U" : "ULL"; - OS << "\n"; OS << "enum class " << Enum << " : " << Type << " {\n"; std::string LastName; for (auto [I, R] : llvm::enumerate(Records)) { @@ -138,17 +135,15 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enums for values that clauses can take. @@ -170,7 +165,6 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, return; } - OS << "\n"; OS << "enum class " << Enum << " {\n"; for (const EnumVal Val : ClauseVals) OS << " " << Val.getRecordName() << "=" << Val.getValue() << ",\n"; @@ -182,6 +176,7 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, OS << "constexpr auto " << CV->getName() << " = " << Enum << "::" << CV->getName() << ";\n"; } + OS << "\n"; EnumHelperFuncs += (Twine("LLVM_ABI ") + Twine(Enum) + Twine(" get") + Twine(Enum) + Twine("(StringRef Str);\n")) .str(); @@ -284,7 +279,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { NamespaceEmitter DirLangNS(OS, DirLang.getCppNamespace()); if (DirLang.hasEnableBitmaskEnumInNamespace()) - OS << "\nLLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n"; + OS << "LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n\n"; // Emit Directive associations std::vector<const Record *> Associations; @@ -315,7 +310,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { generateClauseEnumVal(DirLang.getClauses(), OS, DirLang, EnumHelperFuncs); // Generic function signatures - OS << "\n"; OS << "// Enumeration helper functions\n"; OS << "LLVM_ABI std::pair<Directive, directive::VersionRange> get" << Lang @@ -353,10 +347,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { OS << "LLVM_ABI Association getDirectiveAssociation(Directive D);\n"; OS << "LLVM_ABI Category getDirectiveCategory(Directive D);\n"; OS << "LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);\n"; - if (EnumHelperFuncs.length() > 0) { - OS << EnumHelperFuncs; - OS << "\n"; - } + OS << EnumHelperFuncs; DirLangNS.close(); diff --git a/mlir/test/mlir-tblgen/cpp-class-comments.td b/mlir/test/mlir-tblgen/cpp-class-comments.td index 9dcf975e45286..0d3445d6647af 100644 --- a/mlir/test/mlir-tblgen/cpp-class-comments.td +++ b/mlir/test/mlir-tblgen/cpp-class-comments.td @@ -36,6 +36,7 @@ def A_SomeOp1 : Op<A_Dialect, "some_op1", []>{ let cppNamespace = "OP1"; // OP: namespace OP1 +// OP-EMPTY: // OP-NEXT: /// 
Some Op1 summary line1 // OP-NEXT: /// summary line2 // OP-NEXT: /// Some Op1 description @@ -97,6 +98,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { let methods = [ ]; // ATTR-INTERFACE: namespace mlir::a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: /// Common trait for all layouts. // ATTR-INTERFACE-NEXT: class EncodingTrait; } @@ -104,6 +106,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { def SimpleEncodingTrait : AttrInterface<"SimpleEncodingTrait"> { let cppNamespace = "a::traits"; // ATTR-INTERFACE: namespace a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: class SimpleEncodingTrait; } @@ -114,6 +117,7 @@ def SimpleOpInterface : OpInterface<"SimpleOpInterface"> { Simple Op Interface description }]; // OP-INTERFACE: namespace a::traits { +// OP-INTERFACE-EMPTY: // OP-INTERFACE-NEXT: /// Simple Op Interface description // OP-INTERFACE-NEXT: class SimpleOpInterface; } From c2269c842d28c28c87d41c4afc7b858038e73ad4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 07:12:29 -0800 Subject: [PATCH 175/313] [ADT] Move llvm::to_address to STLForwardCompat.h (NFC) (#166315) This patch moves llvm::to_address to STLForwardCompat.h, a collection of backports from C++20 and beyond. --- llvm/include/llvm/ADT/STLExtras.h | 10 ---------- llvm/include/llvm/ADT/STLForwardCompat.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 8de8eb5b86640..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). -/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..ad94cdede9288 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,16 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. 
+template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
+template <class T> constexpr T *to_address(T *P) { return P; }
+
 //===----------------------------------------------------------------------===//
 // Features from C++23
 //===----------------------------------------------------------------------===//

From 370058777be2282fe18d62315adafdda3960d012 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 4 Nov 2025 07:12:37 -0800
Subject: [PATCH 176/313] [BinaryFormat] Remove redundant declarations (NFC) (#166316)

In C++17, static constexpr members are implicitly inline, so they no
longer require an out-of-line definition.

Once we remove the redundant declarations, Minidump.cpp becomes
effectively empty, so this patch removes the file.

Identified with readability-redundant-declaration.

---
 llvm/lib/BinaryFormat/CMakeLists.txt |  1 -
 llvm/lib/BinaryFormat/Minidump.cpp   | 14 --------------
 2 files changed, 15 deletions(-)
 delete mode 100644 llvm/lib/BinaryFormat/Minidump.cpp

diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt
index 4b2debb7ae236..0c8af1e7a4565 100644
--- a/llvm/lib/BinaryFormat/CMakeLists.txt
+++ b/llvm/lib/BinaryFormat/CMakeLists.txt
@@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat
   ELF.cpp
   MachO.cpp
   Magic.cpp
-  Minidump.cpp
   MsgPackDocument.cpp
   MsgPackDocumentYAML.cpp
   MsgPackReader.cpp
diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/llvm/lib/BinaryFormat/Minidump.cpp
deleted file mode 100644
index b618fb1570126..0000000000000
--- a/llvm/lib/BinaryFormat/Minidump.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/BinaryFormat/Minidump.h"
-
-using namespace llvm::minidump;
-
-constexpr uint32_t Header::MagicSignature;
-constexpr uint16_t Header::MagicVersion;

From 502742b5386836f152bb4642c0505274ef08c2d6 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 4 Nov 2025 07:12:45 -0800
Subject: [PATCH 177/313] [llvm] Proofread MergeFunctions.rst (#166317)

---
 llvm/docs/MergeFunctions.rst | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst
index d43b9c3a89091..d64c846687bae 100644
--- a/llvm/docs/MergeFunctions.rst
+++ b/llvm/docs/MergeFunctions.rst
@@ -8,9 +8,9 @@ MergeFunctions pass, how it works
 Introduction
 ============
 Sometimes code contains equal functions, or functions that do exactly the same
-thing even though they are non-equal on the IR level (e.g.: multiplication on 2
-and 'shl 1'). This can happen for several reasons: mainly, the usage of
-templates and automatic code generators. Though, sometimes the user itself could
+thing even though they are non-equal on the IR level (e.g., multiplication on 2
+and ``shl 1``). This can happen for several reasons: mainly, the usage of
+templates and automatic code generators. However, sometimes the user itself could
 write the same thing twice :-)
 
 The main purpose of this pass is to recognize such functions and merge them.
@@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. -Material is brought in a top-down form, so the reader could start to learn pass +The material is presented in a top-down form, so the reader could start to learn pass from high level ideas and end with low-level algorithm details, thus preparing him or her for reading the sources. The main goal is to describe the algorithm and logic here and the concept. If you *don't want* to read the source code, but want to understand pass algorithms, this document is good for you. The author tries not to repeat the -source-code and covers only common cases to avoid the cases of needing to +source code and covers only common cases to avoid the cases of needing to update this document after any minor code changes. What should I know to be able to follow along with this document? ----------------------------------------------------------------- -The reader should be familiar with common compile-engineering principles and +The reader should be familiar with common compiler-engineering principles and LLVM code fundamentals. In this article, we assume the reader is familiar with `Single Static Assignment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_ @@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are described a bit below. As another example, the reader may imagine two more functions. The first -function performs a multiplication by 2, while the second one performs an +function performs a multiplication by 2, while the second one performs a logical left shift by 1. Possible solutions @@ -131,7 +131,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" How can this be done? Just convert each function to a number, and gather -all of them in a special hash-table. Functions with equal hashes are equal. +all of them in a special hash table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it into the hash. The lookup-up time would be small, but such an approach adds some @@ -175,7 +175,7 @@ merged with each other. It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. 
Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using From 50faea28fb93c5938391fdc0a2cfd70b28280537 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 07:12:53 -0800 Subject: [PATCH 178/313] [llvm] Use conventional enum declarations (NFC) (#166318) This patch replaces: using Foo = enum { A, B, C }; with the more conventional: enum Foo { A, B, C }; These two enum declaration styles are not identical, but their difference does not matter in these .cpp files. 
With the "using Foo" style, the enum is unnamed and cannot be forward-declared, whereas the conventional style creates a named enum that can be. Since these changes are confined to .cpp files, this distinction has no practical impact here. --- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 5 +---- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2 +- llvm/lib/Target/ARM/ARMFastISel.cpp | 2 +- llvm/lib/Target/Mips/MipsFastISel.cpp | 2 +- llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 2 +- llvm/lib/Target/X86/X86VZeroUpper.cpp | 2 +- llvm/unittests/ADT/FallibleIteratorTest.cpp | 4 ++-- 8 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..66ed8b078b808 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92d98578..2f28ab36aa193 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/unittests/ADT/FallibleIteratorTest.cpp b/llvm/unittests/ADT/FallibleIteratorTest.cpp index d3389744ffbfe..c17aa0393dfcb 100644 --- a/llvm/unittests/ADT/FallibleIteratorTest.cpp +++ b/llvm/unittests/ADT/FallibleIteratorTest.cpp @@ -19,8 +19,8 @@ using namespace llvm; namespace { -using ItemValid = enum { ValidItem, InvalidItem }; -using LinkValid = enum { ValidLink, InvalidLink }; +enum ItemValid { ValidItem, InvalidItem }; +enum LinkValid { ValidLink, InvalidLink }; class Item { public: From bcb3d2f5122276ed9969fe2b2ef4428652800377 Mon Sep 17 00:00:00 2001 From: Hans Wennborg <hans@hanshq.net> Date: Tue, 4 Nov 2025 16:41:30 +0100 Subject: [PATCH 179/313] build_llvm_release.bat fixes (#166385) Some followups after #131687 switched to the "runtimes build". - The `check-sanitizer` build target doesn't exist in the runtimes build; use `check-runtimes` instead. - ASan is not supported on 32-bit windows. Pass `-DCOMPILER_RT_BUILD_SANITIZERS=OFF` - `check-runtimes` includes the orcjit tests, which never passed on windows; build with `-DCOMPILER_RT_BUILD_ORC=OFF` - Various asan and libfuzzer tests fail; suppress them with `LIT_FILTER_OUT` --- llvm/utils/release/build_llvm_release.bat | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 001339f2a8f05..0764c7af86c0a 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,6 +1,9 @@ @echo off -setlocal enabledelayedexpansion +REM Filter out tests that are known to fail. +set "LIT_FILTER_OUT=gh110231.cpp|crt_initializers.cpp|init-order-atexit.cpp|use_after_return_linkage.cpp|initialization-bug.cpp|initialization-bug-no-global.cpp|trace-malloc-unbalanced.test|trace-malloc-2.test|TraceMallocTest" + +setlocal enabledelayedexpansion goto begin :usage @@ -24,6 +27,7 @@ echo. 
echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 exit /b 1 + :begin ::============================================================================== @@ -163,7 +167,8 @@ set common_cmake_flags=^ -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ -DLLVM_ENABLE_RPMALLOC=ON ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" + -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" ^ + -DCOMPILER_RT_BUILD_ORC=OFF if "%force-msvc%" == "" ( where /q clang-cl @@ -215,6 +220,7 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin" set cmake_flags=^ %common_cmake_flags% ^ -DLLVM_ENABLE_RPMALLOC=OFF ^ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib @@ -224,7 +230,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -233,6 +239,7 @@ REM with forward slash. set all_cmake_flags=^ %cmake_flags% ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ %common_lldb_flags% ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ @@ -249,7 +256,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. From 2286118e6f2cda56b78d2e6b0193dd6f0ca7b7ea Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Tue, 4 Nov 2025 18:10:26 +0200 Subject: [PATCH 180/313] [SPIRV] Enable `bfloat16` arithmetic (#166031) Enable the `SPV_INTEL_bfloat16_arithmetic` extension, which allows arithmetic, relational and `OpExtInst` instructions to take `bfloat16` arguments. This patch only adds support to arithmetic and relational ops. The extension itself is rather fresh, but `bfloat16` is ubiquitous at this point and not supporting these ops is limiting. 
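For illustration, with `SPV_KHR_bfloat16` and `SPV_INTEL_bfloat16_arithmetic`
enabled, IR such as the following (a minimal sketch; the tests added below
cover the full set of arithmetic and relational ops) now lowers to `OpFAdd`
on a 16-bit bfloat `OpTypeFloat` instead of being rejected:

```llvm
define spir_func bfloat @add(bfloat %x, bfloat %y) {
entry:
  %r = fadd bfloat %x, %y
  ret bfloat %r
}
```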
--- llvm/docs/SPIRVUsage.rst | 2 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 16 +- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 2 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 61 ++- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3 + .../bfloat16-arithmetic.ll | 142 +++++++ .../bfloat16-relational.ll | 376 ++++++++++++++++++ 7 files changed, 595 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 99f56a5cbc63a..749961356d23e 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -173,6 +173,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..4fd220481cedb 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index f0558ebcb6681..43b2869cecdf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index d154a06c6f313..e5ac76c405841 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. 
break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -2060,7 +2062,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 267118364c371..1b4b29bbb160a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, 
[SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter 
[[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability 
BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = 
OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = 
OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} From 71022d1ed6f1446fde4ca13f21259c5e550af0f7 Mon Sep 17 00:00:00 2001 From: Sirraide <aeternalmail@gmail.com> Date: Tue, 4 Nov 2025 17:13:23 +0100 Subject: [PATCH 181/313] [Clang] [Docs] Add some CMake example code for linking against libclang (#166268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Though we have a few code examples in our documentation that show how to *use* libclang, we never actually show how to *link* against it. I myself mostly figured this out through trial and error some time ago, and I’ve since had to explain it to others on several occasions, so I thought adding some very minimal CMake example code might be helpful. --- clang/docs/LibClang.rst | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index e747022b9c173..6c62bcb5f8c29 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -38,6 +38,7 @@ Code example .. 
code-block:: cpp + // main.cpp #include <clang-c/Index.h> #include <iostream> @@ -57,6 +58,22 @@ Code example CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) + Visiting elements of an AST ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``. @@ -283,6 +300,7 @@ Complete example code .. code-block:: cpp + // main.cpp #include <clang-c/Index.h> #include <iostream> @@ -356,6 +374,21 @@ Complete example code ); } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h From 6c563dc6a2127e3f7dd8e957093e57bd3ba35f5b Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <dmitry.chigarev@intel.com> Date: Tue, 4 Nov 2025 17:19:47 +0100 Subject: [PATCH 182/313] [mlir][XeGPU] Add optional layout attribute to LoadGather StoreScatter ops (#163414) As [suggested here](https://github.com/llvm/llvm-project/pull/163071#discussion_r2427229637), the PR adds an optional layout attribute for `LoadGather` and `StoreScatter` ops. For the load op, the attribute describes the layout of the result (e.g. `layout_result_0`); for the store op, it describes the layout of the vector-to-store operand (e.g. `layout_operand_0`). The PR also reworks the `propagate-layout` pass to take permanent layout attributes into account and back-propagate them accordingly. The helper utility function `getDistributeLayoutAttr` is reworked to return either `layout_operand/result_0` or `layout` for load/store ops (depending on which one is set). After an offline discussion, we decided that the overall layout-utilities API is confusing, since it tries to mix permanent and temporary layouts; it will need to be reworked in the future.
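For illustration, here is a minimal sketch of what the new permanent layout attribute looks like in IR. The value names, op shapes, and layout values below are assumptions, chosen to mirror the regression tests further down in this patch:

```mlir
// Hypothetical example: the layout is attached to the load/store ops
// themselves instead of living only in the temporary layout_result_0 /
// layout_operand_0 attributes.
%v = xegpu.load %src[%offsets], %mask <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
  : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
xegpu.store %v, %dst[%offsets], %mask <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
  : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
```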
--------- Signed-off-by: dchigarev <dmitry.chigarev@intel.com> --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++-- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +- .../VectorToXeGPU/VectorToXeGPU.cpp | 12 ++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 41 ++++++++++-- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 15 +++-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 12 +++- .../Transforms/XeGPUWgToSgDistribute.cpp | 16 ++--- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 65 +++++++++++++++++-- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 40 ++++++++++++ .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 2 +- 10 files changed, 200 insertions(+), 33 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 426377fcf598f..689ebd0d1179a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -895,7 +896,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Type": $value, "Value": $source, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -979,7 +987,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1030,7 +1039,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $value, "Value": $dest, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 04cfd58d846a7..620a2fe43d682 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -104,11 +104,15 @@ void removeLayoutAttrs(Operation *op); /// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes +/// 
If `respectPermLayout` is true the existing permanent layout +/// attribute will be kept and assigned to the attribute dict instead +/// of the provided layout. template <typename T, typename = std::enable_if_t<std::is_same_v<T, OpOperand> || std::is_same_v<T, OpResult>>> void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); + const DistributeLayoutAttr layout, + bool respectPermLayout = false); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index abea84f6b01fe..1b4d1a42614ea 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -457,7 +457,8 @@ static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.replaceOp(readOp, gatherOp.getResult()); return success(); @@ -491,7 +492,8 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(writeOp); return success(); } @@ -646,7 +648,8 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); auto selectOp = arith::SelectOp::create(rewriter, loc, gatherOp.getMask(), @@ -680,7 +683,8 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(scatterOp); return success(); } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index fb51077b5dff3..4dd10bedc6d84 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -876,7 +876,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -892,7 +892,24 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, auto offset = vector::FromElementsOp::create(builder, loc, type, values); build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); +} + +void LoadGatherOp::build(OpBuilder &builder, OperationState &state, + Type valueType, Value source, + ArrayRef<OpFoldResult> offsets, Value mask, + IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, + xegpu::LayoutAttr layout) { + auto loc =
source.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, + l2_hint, l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -943,7 +960,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); } void StoreScatterOp::build(OpBuilder &builder, OperationState &state, @@ -961,7 +978,23 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, // Call the correct builder overload that does not expect result types. build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, - l3_hint); + l3_hint, /*layout=*/nullptr); +} + +void StoreScatterOp::build( + OpBuilder &builder, OperationState &state, Value value, Value dest, + ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size, + xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + auto loc = dest.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + // Call the correct builder overload that does not expect result types. + build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, + l3_hint, layout); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 90eae871a5ef3..14c49e7f45706 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -904,9 +904,16 @@ void LayoutInfoPropagation::visitStoreScatterOp( if (dstTdescTy.getChunkSizeAsInt() > 1) instData.push_back(chunkSize); } - LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered=*/true); + + LayoutInfo payloadLayout; + + if (auto layout = storeScatter.getLayoutAttr()) { + payloadLayout = LayoutInfo(layout); + } else { + payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); + } LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); @@ -1041,7 +1048,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. 
- xegpu::setDistributeLayoutAttr(result, layout); + xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true); } return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index e6e71cc29a80a..c3bf9606693a8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,12 +678,16 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + SmallVector<Value> newOps; for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) { auto newOp = xegpu::LoadGatherOp::create( rewriter, loc, newValueTy, op.getSource(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + op.getL2HintAttr(), op.getL3HintAttr(), layout); newOps.push_back(newOp); } @@ -774,12 +778,16 @@ struct UnrollStoreScatterOpWithOffsets SmallVector<Value> convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + for (auto [v, o, m] : llvm::zip(convertedValues, convertedOffsets, convertedMasks)) { xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr()); + op.getL3HintAttr(), layout); } rewriter.eraseOp(op); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 79eea55c8b78a..d12a04df5c46c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -889,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef<int64_t> wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -915,9 +915,8 @@ struct WgToSgLoadGatherOpWithOffset llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), - layout.dropSgLayoutAndData()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -942,8 +941,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getOperand(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -965,7 +964,8 @@ struct WgToSgStoreScatterOpWithOffset adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { auto store = xegpu::StoreScatterOp::create( rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); + 
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); // Update the layout attribute to drop sg_layout and sg_data. if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index a38993e0c55b1..d575a415a3035 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -144,6 +144,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName); + + // check for "permanent" layout only after "temporary" layout name lookup + // for backward compatibility + if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp)) + return loadGatherOp.getLayoutAttr(); } if (auto arg = dyn_cast<BlockArgument>(value)) { @@ -171,27 +176,77 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName); + + // check for "permanent" layout only after "temporary" layout name lookup + if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op)) + if (auto layout = storeScatterOp.getLayoutAttr()) + return layout; + return getDistributeLayoutAttr(opr.get()); } +// Returns the permanent layout attribute for the given result if it's +// available on the defining op. Otherwise returns the provided layout. +xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpResult &result, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + + if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) { + if (auto perm = loadOp.getLayoutAttr()) + candidate = perm; + } + + return candidate; +} + +// Returns the permanent layout attribute for the given operand if it's +// available on the defining op. Otherwise returns the provided layout.
+xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpOperand &operand, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber(); + + if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) { + if (idx == 0) { + if (auto perm = storeOp.getLayoutAttr()) + candidate = perm; + } + } + + return candidate; +} + template <typename T, typename> void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout) { + const DistributeLayoutAttr layout, + bool respectPermLayout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name)) - owner->setAttr(name, layout); + + if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) + return; + + DistributeLayoutAttr candidate = layout; + if (respectPermLayout) + candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + + if (candidate) + owner->setAttr(name, candidate); } // Explicit instantiation for OpResult template void xegpu::setDistributeLayoutAttr<mlir::OpResult>( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); // Explicit instantiation for OpOperand template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 543e119d81d88..61e315d0d2080 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -215,6 +215,46 @@ func.func @scatter_ops(%src: memref<256xf16>) { } // ----- gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_custom_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : 
vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 52acde4dffc2e..8d98fcfd0d2c2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -285,7 +285,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> From a50d036c0bf456f4d2b6d6a19f5e45f61860994c Mon Sep 17 00:00:00 2001 From: Manuel Carrasco <Manuel.Carrasco@amd.com> Date: Tue, 4 Nov 2025 16:21:47 +0000 Subject: [PATCH 183/313] [NFC] [Build Fix] Fix 
failing test case due to missing host arch. (#166392) This fixes a typo introduced in #165606 which makes the test case fail. --------- Co-authored-by: Joseph Huber <huberjn@outlook.com> --- clang/test/Driver/hip-spirv-translator-new-driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c index 315a74635b9b3..67d894e2eb506 100644 --- a/clang/test/Driver/hip-spirv-translator-new-driver.c +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c @@ -2,7 +2,7 @@ // The input and output files cannot be the same. // RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ -// RUN: --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \ // RUN: | FileCheck %s // CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" From 4749bf56a65e38ee7b05ac7f9fe261aab6cb5bc6 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 16:24:24 +0000 Subject: [PATCH 184/313] [lldb] When starting in a hidden frame, don't skip over hidden frames when navigating up/down (#166394) When stopped in a hidden frame (either because we selected the hidden frame or hit a breakpoint inside it), a user is most likely interested in exploring the immediate frames around it. But currently, issuing `up`/`down` commands will unconditionally skip over all hidden frames. This patch makes it so `up`/`down` commands don't skip hidden frames if the frame we started in was a hidden frame. --- lldb/source/Commands/CommandObjectFrame.cpp | 52 +++++++++++-------- .../API/commands/frame/select-hidden/Makefile | 3 ++ .../select-hidden/TestNavigateHiddenFrame.py | 32 ++++++++++++ .../API/commands/frame/select-hidden/main.cpp | 13 +++++ 4 files changed, 78 insertions(+), 22 deletions(-) create mode 100644 lldb/test/API/commands/frame/select-hidden/Makefile create mode 100644 lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py create mode 100644 lldb/test/API/commands/frame/select-hidden/main.cpp diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame.
+ break; + } + candidate_idx = UINT32_MAX; + break; + }; + if (candidate_idx != UINT32_MAX) + m_options.relative_frame_offset = candidate_idx - frame_idx; + } + protected: void DoExecute(Args &command, CommandReturnObject &result) override { // No need to check "thread" for validity as eCommandRequiresThread ensures @@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; - // If moving up/down by one, skip over hidden frames. - if (*m_options.relative_frame_offset == 1 || - *m_options.relative_frame_offset == -1) { - uint32_t candidate_idx = frame_idx; - const unsigned max_depth = 12; - for (unsigned num_try = 0; num_try < max_depth; ++num_try) { - if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { - candidate_idx = UINT32_MAX; - break; - } - candidate_idx += *m_options.relative_frame_offset; - if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { - if (candidate_sp->IsHidden()) - continue; - // Now candidate_idx is the first non-hidden frame. - break; - } - candidate_idx = UINT32_MAX; - break; - }; - if (candidate_idx != UINT32_MAX) - m_options.relative_frame_offset = candidate_idx - frame_idx; + // If moving up/down by one, skip over hidden frames, unless we started + // in a hidden frame. + if ((*m_options.relative_frame_offset == 1 || + *m_options.relative_frame_offset == -1)) { + if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx); + !current_frame_sp->IsHidden()) + SkipHiddenFrames(*thread, frame_idx); } if (*m_options.relative_frame_offset < 0) { diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} From ed7d6c3511df7f5d1dbf52579740f7f4e4ada4f9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 08:28:51 -0800 Subject: [PATCH 185/313] [ADT] Deprecate a soft-deprecated APInt constructor (#166314) This patch deprecates an APInt constructor that has been soft-deprecated via comments since: commit 7a16288157efc5fb85fbe3b8b4c37071da7609a6 Author: Jeffrey Yasskin <jyasskin@google.com> Date: Mon Jul 18 21:45:40 2011 +0000 This patch updates a small number of remaining uses. --- lldb/source/Utility/RegisterValue.cpp | 8 +++++--- llvm/include/llvm/ADT/APInt.h | 1 + polly/lib/Support/GICHelper.cpp | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 26283d2437d48..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. 
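To make the deprecation concrete, here is a minimal, self-contained sketch of the migration this patch applies in its other hunks; the function name and word values are illustrative:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"

llvm::APInt makeWideValue() {
  uint64_t Words[2] = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  // Deprecated: APInt(unsigned numBits, unsigned numWords,
  //                   const uint64_t bigVal[])
  //   llvm::APInt Old(128, 2, Words);
  // Preferred: pass the words as an ArrayRef, so the call cannot be
  // incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  return llvm::APInt(128, llvm::ArrayRef(Words, 2));
}
```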
diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index 027e0194732f4..948bb6a9b9614 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -59,7 +59,7 @@ APInt polly::APIntFromVal(__isl_take isl_val *Val) { Data = (uint64_t *)malloc(NumChunks * ChunkSize); isl_val_get_abs_num_chunks(Val, ChunkSize, Data); int NumBits = CHAR_BIT * ChunkSize * NumChunks; - APInt A(NumBits, NumChunks, Data); + APInt A(NumBits, ArrayRef(Data, NumChunks)); // As isl provides only an interface to obtain data that describes the // absolute value of an isl_val, A at this point always contains a positive From 78769d51c6b183b6d88fbd1ef825fff7cf3aad21 Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Tue, 4 Nov 2025 08:29:47 -0800 Subject: [PATCH 186/313] [LLDB] Don't check for libcxx if LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS is off --- lldb/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. " "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " From cc3ad201ecd50bcc85d1488084dda079c0671652 Mon Sep 17 00:00:00 2001 From: Marco Borgeaud <marco.borgeaud@sonarsource.com> Date: Tue, 4 Nov 2025 17:39:23 +0100 Subject: [PATCH 187/313] [analyzer] Revert incorrect LazyCompoundVal changes (#163461) Reverts #115917 and its follow-up #116840. Fixes #153782 and introduces regression tests. Reopens #114270. --- clang/lib/StaticAnalyzer/Core/RegionStore.cpp | 48 ----------- .../test/Analysis/NewDelete-checker-test.cpp | 80 ++++++++++++++++++- clang/test/Analysis/ctor-trivial-copy.cpp | 62 ++++++++++---- clang/test/Analysis/explain-svals.cpp | 2 +- clang/test/Analysis/iterator-modeling.cpp | 2 - ...-modeling-aggressive-std-find-modeling.cpp | 10 --- .../test/Analysis/stl-algorithm-modeling.cpp | 10 --- clang/test/Analysis/store-dump-orders.cpp | 2 +- clang/test/Analysis/taint-generic.cpp | 6 +- .../test/Analysis/template-param-objects.cpp | 2 +- 10 files changed, 131 insertions(+), 93 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 2838533c1a406..4f4824a3616ce 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager { return getBinding(getRegionBindings(S), L, T); } - std::optional<SVal> getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const; - std::optional<SVal> - getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const; - std::optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override { RegionBindingsRef B = getRegionBindings(S); // Default bindings are always applied over a base region so look up the @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B, // behavior doesn't depend on the struct layout. // This way even an empty struct can carry taint, no matter if creduce drops // the last field member or not.
- - // Try to avoid creating a LCV if it would anyways just refer to a single - // default binding. - if (std::optional<SVal> Val = getUniqueDefaultBinding(B, R)) - return *Val; return createLazyBinding(B, R); } @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B, return NewB; } -std::optional<SVal> -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const { - if (R != R->getBaseRegion()) - return std::nullopt; - - const auto *Cluster = B.lookup(R); - if (!Cluster || !llvm::hasSingleElement(*Cluster)) - return std::nullopt; - - const auto [Key, Value] = *Cluster->begin(); - return Key.isDirect() ? std::optional<SVal>{} : Value; -} - -std::optional<SVal> -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const { - auto B = getRegionBindings(LCV.getStore()); - return getUniqueDefaultBinding(B, LCV.getRegion()); -} - std::optional<LimitedRegionBindingsRef> RegionStoreManager::tryBindSmallStruct( LimitedRegionBindingsConstRef B, const TypedValueRegion *R, const RecordDecl *RD, nonloc::LazyCompoundVal LCV) { if (B.hasExhaustedBindingLimit()) return B.withValuesEscaped(LCV); - // If we try to copy a Conjured value representing the value of the whole - // struct, don't try to element-wise copy each field. - // That would unnecessarily bind Derived symbols slicing off the subregion for - // the field from the whole Conjured symbol. - // - // struct Window { int width; int height; }; - // Window getWindow(); <-- opaque fn. - // Window w = getWindow(); <-- conjures a new Window. - // Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct" - // - // We should not end up with a new Store for "w2" like this: - // Direct [ 0..31]: Derived{Conj{}, w.width} - // Direct [32..63]: Derived{Conj{}, w.height} - // Instead, we should just bind that Conjured value instead. 
- if (std::optional<SVal> Val = getUniqueDefaultBinding(LCV)) { - return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value()); - } - FieldVector Fields; if (const CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(RD)) diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp index c417b9c2ac97e..fd831cc0985cc 100644 --- a/clang/test/Analysis/NewDelete-checker-test.cpp +++ b/clang/test/Analysis/NewDelete-checker-test.cpp @@ -3,13 +3,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks @@ -19,13 +19,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,leak,inspection \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks \ @@ -503,3 +503,75 @@ namespace optional_union { custom_union_t a; } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}} } + +namespace gh153782 { + +// Ensure we do not regress on the following use case. + +namespace mutually_exclusive_test_case_1 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + object[0] = StorageWrapper(); // This assignment leads to the double-free. +} +} // mutually_exclusive_test_case_1 + +namespace mutually_exclusive_test_case_2 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + // object[0] = StorageWrapper(); // Remove the source of double free to make the potential leak appear. 
+} // leak-warning{{Potential leak of memory pointed to by 'object'}} +} // mutually_exclusive_test_case_2 + +namespace mutually_exclusive_test_case_3 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +struct TestDoubleFreeWithInitializerList { + StorageWrapper* Object; + TestDoubleFreeWithInitializerList() + : Object(new StorageWrapper[]{StorageWrapper()}) { + Object[0] = StorageWrapper(); // This assignment leads to the double-free. + } +}; +} // mutually_exclusive_test_case_3 + +} // namespace gh153782 diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp index 940ff9ba3ed9c..44990fc631d6d 100644 --- a/clang/test/Analysis/ctor-trivial-copy.cpp +++ b/clang/test/Analysis/ctor-trivial-copy.cpp @@ -5,8 +5,6 @@ void clang_analyzer_printState(); template <typename T> void clang_analyzer_dump_lref(T& param); template <typename T> void clang_analyzer_dump_val(T param); -template <typename T> void clang_analyzer_denote(T param, const char *name); -template <typename T> void clang_analyzer_express(T param); template <typename T> T conjure(); template <typename... Ts> void nop(const Ts &... args) {} @@ -42,10 +40,16 @@ void test_assign_return() { namespace trivial_struct_copy { void _01_empty_structs() { - clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{lazyCompoundVal}} empty Empty = conjure<empty>(); empty Empty2 = Empty; empty Empty3 = Empty2; + // All of these should refer to the exact same LCV, because all of + // these trivial copies refer to the original conjured value. + // There were Unknown before: + clang_analyzer_dump_val(Empty); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}} // We only have binding for the original Empty object, because copying empty // objects is a no-op in the performTrivialCopy. This is fine, because empty @@ -67,20 +71,18 @@ void _01_empty_structs() { } void _02_structs_with_members() { - clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{lazyCompoundVal}} aggr Aggr = conjure<aggr>(); aggr Aggr2 = Aggr; aggr Aggr3 = Aggr2; - // All of these should refer to the exact same symbol, because all of + // All of these should refer to the exact same LCV, because all of // these trivial copies refer to the original conjured value. - clang_analyzer_denote(Aggr, "$Aggr"); - clang_analyzer_express(Aggr); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}} - - // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3". - // We used to have Derived symbols for the individual fields that were - // copied as part of copying the whole struct. 
+ clang_analyzer_dump_val(Aggr); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}} + + // We have fields in the struct we copy, thus we also have the entries for the copies + // (and for all of their fields). clang_analyzer_printState(); // CHECK: "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [ // CHECK-NEXT: { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [ @@ -93,10 +95,12 @@ void _02_structs_with_members() { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]} // CHECK-NEXT: ]}, @@ -113,3 +117,31 @@ void entrypoint() { } } // namespace trivial_struct_copy + +namespace gh153782 { + +// Ensure we do not regress on the following use cases. +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`. +struct Status { int error; }; +Status getError(); + +Status setPtr(int **outptr, int* ptr) { + Status e = getError(); + if (e.error != 0) return e; // When assuming the error field is non-zero, + *outptr = ptr; // this is not executed + return e; +} + +int func() { + int *ptr = nullptr; + int x = 42; + if (setPtr(&ptr, &x).error == 0) { + // The assumption made in get() SHOULD match the assumption about + // the returned value, hence the engine SHOULD NOT assume ptr is null. 
+ clang_analyzer_dump_val(ptr); // expected-warning {{&x}} + return *ptr; + } + return 0; +} + +} // namespace gh153782 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index dfc650223c9e7..9474aa7c7dbb1 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -99,7 +99,7 @@ class C { } // end of anonymous namespace void test_6() { - clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}} + clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}} clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}} } diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp index 78882da4431fd..f1538839d06c8 100644 --- a/clang/test/Analysis/iterator-modeling.cpp +++ b/clang/test/Analysis/iterator-modeling.cpp @@ -2035,7 +2035,6 @@ void print_state(std::vector<int> &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i0 : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} @@ -2046,7 +2045,6 @@ void print_state(std::vector<int> &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i1 : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp index 191af95cd2b9c..98301cf7274fc 100644 --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp @@ -4,16 +4,6 @@ // RUN: -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. 
-// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp index f7029c79b0942..5549c24a8c220 100644 --- a/clang/test/Analysis/stl-algorithm-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp @@ -3,16 +3,6 @@ // RUN: -analyzer-config aggressive-binary-operation-simplification=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. -// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp index dbe93f1c5183a..d99f581f00fe1 100644 --- a/clang/test/Analysis/store-dump-orders.cpp +++ b/clang/test/Analysis/store-dump-orders.cpp @@ -41,7 +41,7 @@ void test_output(int n) { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ + // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "lazyCompoundVal // CHECK-NEXT: { "kind": "Direct", "offset": 320, "value": "1 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 352, "value": "2 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 384, "value": "3 S32b" } diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp index fc7c37300d3fc..4b8d9ab68ff84 100644 --- a/clang/test/Analysis/taint-generic.cpp +++ b/clang/test/Analysis/taint-generic.cpp @@ -158,7 +158,11 @@ void top() { clang_analyzer_isTainted(E); // expected-warning {{NO}} Aggr A = mySource1<Aggr>(); - clang_analyzer_isTainted(A); // expected-warning {{YES}} + // FIXME Ideally, both A and A.data should be tainted. However, the + // implementation used by e5ac9145ba29 ([analyzer][taint] Recognize + // tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and + // FNs in various scenarios and had to be reverted to fix #153782. + clang_analyzer_isTainted(A); // expected-warning {{NO}} clang_analyzer_isTainted(A.data); // expected-warning {{YES}} } } // namespace gh114270 diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp index b065f8756d4d8..dde95fa62cb65 100644 --- a/clang/test/Analysis/template-param-objects.cpp +++ b/clang/test/Analysis/template-param-objects.cpp @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) { return lhs.value == rhs.value; } template <Box V> void dumps() { - clang_analyzer_dump(V); // expected-warning {{Unknown}} + clang_analyzer_dump(V); // expected-warning {{lazyCompoundVal}} clang_analyzer_dump(&V); // expected-warning {{Unknown}} clang_analyzer_dump(V.value); // expected-warning {{Unknown}} FIXME: It should be '6 S32b'. 
   clang_analyzer_dump(&V.value); // expected-warning {{Unknown}}

From 2dc0fa1000d651c902edad7d206785de7efc75f3 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Tue, 4 Nov 2025 17:43:17 +0100
Subject: [PATCH 188/313] [Flang] Nested directives are comments (#166348)

Directives cannot be nested. A directive sentinel that appears within
another directive should be ignored, with the line instead falling back
to being treated as an ordinary comment.

Fixes: #165874
---
 flang/lib/Parser/prescan.cpp                  | 2 +-
 flang/test/Parser/OpenMP/nested-directive.f90 | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Parser/OpenMP/nested-directive.f90

diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index efce8fc3d2e35..8cccd84f9fa19 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const {
     return true; // skip over ignored columns in right margin (73:80)
   } else if (*at_ == '!' && !inCharLiteral_ &&
       (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) {
-    return !IsCompilerDirectiveSentinel(at_ + 1);
+    return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1);
   } else {
     return false;
   }
diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90
new file mode 100644
index 0000000000000..2a10bbe666bb8
--- /dev/null
+++ b/flang/test/Parser/OpenMP/nested-directive.f90
@@ -0,0 +1,7 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines
+
+subroutine func
+  implicit none
+! CHECK: !$OMP NOTHING
+  !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment
+end subroutine func

From 89ec96b8b4f4a3115689b045cd64afae1c28044e Mon Sep 17 00:00:00 2001
From: Tim Corringham <timothy.corringham@amd.com>
Date: Tue, 4 Nov 2025 17:04:39 +0000
Subject: [PATCH 189/313] [HLSL] Implement the f16tof32() intrinsic (#165860)

Implement the f16tof32() intrinsic, including DXIL and SPIRV codegen,
and associated tests.
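For readers unfamiliar with the legacy conversion, the per-lane semantics that
the DXIL LegacyF16ToF32 op implements can be sketched in plain C++. This is a
hedged reference model only (it assumes IEEE-754 binary16/binary32 layouts and
is not the actual DXIL or SPIR-V lowering; the function name is illustrative):

#include <cstdint>
#include <cstring>

// Interpret the low 16 bits of X as an IEEE-754 binary16 value and widen
// it to binary32. The upper 16 bits are ignored, matching the legacy
// f16tof32 behavior of only converting the low half-word.
float F16ToF32Reference(uint32_t X) {
  uint32_t H = X & 0xFFFFu;
  uint32_t Sign = (H & 0x8000u) << 16; // sign moves from bit 15 to bit 31
  uint32_t Exp = (H >> 10) & 0x1Fu;
  uint32_t Mant = H & 0x3FFu;
  uint32_t Bits;
  if (Exp == 0x1Fu) {
    Bits = Sign | 0x7F800000u | (Mant << 13); // Inf/NaN: keep the payload
  } else if (Exp != 0) {
    Bits = Sign | ((Exp + 112u) << 23) | (Mant << 13); // rebias 15 -> 127
  } else if (Mant != 0) {
    unsigned Shift = 0; // subnormal half: renormalize into a normal float
    while (!(Mant & 0x400u)) {
      Mant <<= 1;
      ++Shift;
    }
    Bits = Sign | ((113u - Shift) << 23) | ((Mant & 0x3FFu) << 13);
  } else {
    Bits = Sign; // signed zero
  }
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

On SPIR-V targets the codegen below reaches the same result via the GLSL
UnpackHalf2x16 extended instruction and an extract of lane 0, which avoids
requiring the Int16 and Float16 capabilities.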
Fixes #99112 --------- Co-authored-by: Tim Corringham <tcorring@amd.com> --- clang/include/clang/Basic/Builtins.td | 6 + clang/lib/CodeGen/CGHLSLBuiltins.cpp | 54 +++++++ .../lib/Headers/hlsl/hlsl_alias_intrinsics.h | 21 +++ clang/lib/Sema/SemaHLSL.cpp | 57 ++++++-- .../builtins/f16tof32-builtin.hlsl | 30 ++++ clang/test/CodeGenHLSL/builtins/f16tof32.hlsl | 30 ++++ .../SemaHLSL/BuiltIns/f16tof32-errors.hlsl | 134 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 3 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 3 + llvm/lib/Target/DirectX/DXIL.td | 9 ++ .../DirectX/DirectXTargetTransformInfo.cpp | 8 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 4 + llvm/test/CodeGen/DirectX/f16tof32.ll | 57 ++++++++ llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll | 18 +++ 14 files changed, 419 insertions(+), 15 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/f16tof32.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/f16tof32.ll create mode 100644 llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2b400b012d6ed..0275447e1090a 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5235,6 +5235,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate { let Prototype = "T(unsigned int, T)"; } +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_f16tof32"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + // Builtins for XRay. def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index fbf4a5722caed..b6928ce7d9c44 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF, + const CallExpr *E) { + Value *Op0 = CGF.EmitScalarExpr(E->getArg(0)); + QualType Op0Ty = E->getArg(0)->getType(); + llvm::Type *ResType = CGF.FloatTy; + uint64_t NumElements = 0; + if (Op0->getType()->isVectorTy()) { + NumElements = + E->getArg(0)->getType()->castAs<clang::VectorType>()->getNumElements(); + ResType = + llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements)); + } + if (!Op0Ty->hasUnsignedIntegerRepresentation()) + llvm_unreachable( + "f16tof32 operand must have an unsigned int representation"); + + if (CGF.CGM.getTriple().isDXIL()) + return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32, + ArrayRef<Value *>{Op0}, nullptr, + "hlsl.f16tof32"); + + if (CGF.CGM.getTriple().isSPIRV()) { + // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the + // Int16 and Float16 capabilities + auto UnpackType = + llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2)); + if (NumElements == 0) { + // a scalar input - simply extract the first element of the unpacked + // vector + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef<Value *>{Op0}); + return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + } else { + // a vector input - build a congruent output vector by iterating through + // the input vector calling 
unpackhalf2x16 for each element
+      Value *Result = PoisonValue::get(ResType);
+      for (uint64_t i = 0; i < NumElements; i++) {
+        Value *InVal = CGF.Builder.CreateExtractElement(Op0, i);
+        Value *Unpack = CGF.Builder.CreateIntrinsic(
+            UnpackType, Intrinsic::spv_unpackhalf2x16,
+            ArrayRef<Value *>{InVal});
+        Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
+        Result = CGF.Builder.CreateInsertElement(Result, Res, i);
+      }
+      return Result;
+    }
+  }
+
+  llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture");
+}
+
 static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr,
                                LValue &Stride) {
   // Figure out the stride of the buffer elements from the handle type.
@@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(),
         ArrayRef<Value *>{X}, nullptr, "hlsl.degrees");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
+    return handleElementwiseF16ToF32(*this, E);
+  }
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     if (!E->getArg(0)->getType()->hasFloatingRepresentation())
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index a918af39e4074..4c5861c2c5f9d 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -1052,6 +1052,27 @@ float3 exp2(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2)
 float4 exp2(float4);
 
+//===----------------------------------------------------------------------===//
+// f16tof32 builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn float f16tof32(uint x)
+/// \brief Returns the half value stored in the low 16 bits of the uint
+/// argument, converted to a float.
+/// \param x The uint containing two half values.
+///
+/// The float value of the half value found in the low 16 bits of the \a x
+/// parameter.
+ +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float f16tof32(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float2 f16tof32(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float3 f16tof32(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float4 f16tof32(uint4); + //===----------------------------------------------------------------------===// // firstbithigh builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 94a490a8f68dc..b9707f0036765 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2802,6 +2802,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc, return false; } +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall, + unsigned ArgOrdinal, unsigned Width) { + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs<VectorType>()) + ArgTy = VTy->getElementType(); + // ensure arg type has expected bit width + uint64_t ElementBitCount = + S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8; + if (ElementBitCount != Width) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_integer_incorrect_bit_count) + << Width << ElementBitCount; + return true; + } + return false; +} + static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, QualType ReturnType) { auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>(); @@ -2961,24 +2978,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { CheckUnsignedIntVecRepresentation)) return true; - auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>(); // ensure arg integers are 32-bits - uint64_t ElementBitCount = getASTContext() - .getTypeSizeInChars(VTy->getElementType()) - .getQuantity() * - 8; - if (ElementBitCount != 32) { - SemaRef.Diag(TheCall->getBeginLoc(), - diag::err_integer_incorrect_bit_count) - << 32 << ElementBitCount; + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) return true; - } // ensure both args are vectors of total bit size of a multiple of 64 + auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>(); int NumElementsArg = VTy->getNumElements(); if (NumElementsArg != 2 && NumElementsArg != 4) { SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count) - << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount; + << 1 /*a multiple of*/ << 64 << NumElementsArg * 32; return true; } @@ -3295,7 +3304,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } // Note these are llvm builtins that we want to catch invalid intrinsic - // generation. Normal handling of these builitns will occur elsewhere. + // generation. Normal handling of these builtins will occur elsewhere. 
case Builtin::BI__builtin_elementwise_bitreverse: { // does not include a check for number of arguments // because that is done previously @@ -3405,6 +3414,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { } break; } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + CheckUnsignedIntRepresentation)) + return true; + // ensure arg integers are 32 bits + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) + return true; + // check it wasn't a bool type + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs<VectorType>()) + ArgTy = VTy->getElementType(); + if (ArgTy->isBooleanType()) { + SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_builtin_invalid_arg_type) + << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3 + << /* no fp */ 0 << TheCall->getArg(0)->getType(); + return true; + } + + SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy); + break; + } } return false; } diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl new file mode 100644 index 0000000000000..65dba664bb5ea --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl new file mode 100644 index 0000000000000..b68bc197f16c5 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | 
FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return f16tof32(p0); } + + + diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl new file mode 100644 index 0000000000000..8f2f9308ed966 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl @@ -0,0 +1,134 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify + +float builtin_f16tof32_too_few_arg() { + return __builtin_hlsl_elementwise_f16tof32(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}} +} + +float builtin_f16tof32_too_many_arg(uint p0) { + return __builtin_hlsl_elementwise_f16tof32(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}} +} + +float builtin_f16tof32_bool(bool p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float builtin_f16tof32_bool4(bool4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector<bool, 4>')}} +} + +float builtin_f16tof32_short(short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float builtin_f16tof32_unsigned_short(unsigned short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float builtin_f16tof32_int(int p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types 
(was 'int')}} +} + +float builtin_f16tof32_int64_t(long p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector<int, 2>'))}} +} + +float builtin_f16tof32_half(half p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float builtin_f16tof32_half4(half4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector<half, 4>'))}} +} + +float builtin_f16tof32_float(float p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float builtin_f16tof32_double(double p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} + +float f16tof32_too_few_arg() { + return f16tof32(); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_too_many_arg(uint p0) { + return f16tof32(p0, p0); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_bool(bool p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float f16tof32_bool3(bool3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector<bool, 3>'))}} +} + + +float f16tof32_int16_t(short p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float f16tof32_int16_t(unsigned short p0) { + return f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float f16tof32_int(int p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float f16tof32_int64_t(long p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 f16tof32_int2_to_float2_promotion(int3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector<int, 3>'))}} +} + +float f16tof32_half(half p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float f16tof32_half2(half2 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector<half, 2>'))}} +} + +float f16tof32_float(float p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float f16tof32_double(double p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} diff --git 
a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..9924b905aee63 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f436c72..245e5a2894604 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return 
selectResourceNonUniformIndex(ResVReg, ResType, I);
   }
+  case Intrinsic::spv_unpackhalf2x16: {
+    return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16);
+  }
+
   default: {
     std::string DiagMsg;
     raw_string_ostream OS(DiagMsg);
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: [[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3
+  ; CHECK: ret <4 x float> [[FLOAT4_3]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0)
+  ret <4 x float> %hlsl.f16tof32
+}
diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
new file mode 100644
index 0000000000000..6a9ce4515f5c0
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0
+; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32
+; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2
+
+; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]]
+; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]]
+; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0
+; CHECK: OpReturnValue [[UNPACK]]
+define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 {
+  %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0)
+  %3 = extractelement <2 x float> %2, i64 0
+  ret float %3
+}

From 83d1599871b2bf7cf8a093c272367a141767a895 Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 4 Nov 2025 09:04:56 -0800
Subject: [PATCH 190/313] [lldb-dap] Addressing orphaned processes in tests.
 (#166205)

In lldb-dap tests, we sometimes spawn subprocesses directly but do not
always correctly clean them up. This can cause some tests, like
`TestDAP_disconnect.test_attach`, to hang and not properly respect
timeouts.

To fix this, I am passing the `lldbtest.Base.spawnSubprocess` helper to
the adapter client so it can be used to spawn subprocesses in a way
that we can ensure they're cleaned up.
---
 .../test/tools/lldb-dap/dap_server.py         | 75 ++++++++++---------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  1 +
 .../lldb-dap/disconnect/TestDAP_disconnect.py |  9 +--
 .../tools/lldb-dap/server/TestDAP_server.py   |  3 +-
 4 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index d892c01f0bc71..ac550962cfb85 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -32,6 +32,10 @@
 # timeout by a factor of 10 if ASAN is enabled.
 DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)
 
+# See lldbtest.Base.spawnSubprocess, which should help ensure any processes
+# created by the DAP client are terminated correctly when the test ends.
+SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen] + ## DAP type references @@ -191,14 +195,16 @@ def __init__( self, recv: BinaryIO, send: BinaryIO, - init_commands: list[str], - log_file: Optional[TextIO] = None, + init_commands: Optional[List[str]] = None, + log_file: Optional[str] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): # For debugging test failures, try setting `trace_file = sys.stderr`. self.trace_file: Optional[TextIO] = None self.log_file = log_file self.send = send self.recv = recv + self.spawn_helper = spawn_helper # Packets that have been received and processed but have not yet been # requested by a test case. @@ -211,7 +217,7 @@ def __init__( self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state - self.init_commands = init_commands + self.init_commands = init_commands if init_commands else [] self.exit_status: Optional[int] = None self.capabilities: Dict = {} self.initialized: bool = False @@ -310,11 +316,6 @@ def collect_output( output += self.get_output(category, clear=clear) return output - def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): - with self.recv_condition: - self.recv_packets.append(packet) - self.recv_condition.notify() - def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: """Handles an incoming packet. @@ -460,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -1501,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1532,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + 
log_file, + spawn_helper, ) @classmethod @@ -1546,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1561,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index c6c4a3e2a4e1e..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -39,6 +39,7 @@ def create_debug_adapter( log_file=log_file_path, env=lldbDAPEnv, additional_args=additional_args or [], + spawn_helper=self.spawnSubprocess, ) def build_and_create_debug_adapter( diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" From 
af9a4263a1a209953a1d339ef781a954e31268ff Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 4 Nov 2025 17:08:12 +0000
Subject: [PATCH 191/313] [LAA] Only use inbounds/nusw in isNoWrap if the GEP
 is dereferenced. (#161445)

Update isNoWrap to only use the inbounds/nusw flags from GEPs that are
guaranteed to be dereferenced on every iteration. This fixes a case
where we incorrectly determine no dependence.

I think the issue is isolated to code that evaluates the resulting
AddRec at BTC; just using it to compute the distance between accesses
should still be fine: if the access does not execute in a given
iteration, there's no dependence in that iteration. But isolating the
code is not straightforward, so be conservative for now.

The practical impact should be very minor (only one loop changed across
a corpus with 27k modules from large C/C++ workloads).

Fixes https://github.com/llvm/llvm-project/issues/160912.

PR: https://github.com/llvm/llvm-project/pull/161445
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  2 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 44 ++++++---
 llvm/lib/Analysis/VectorUtils.cpp             | 11 ++-
 .../AArch64/AArch64TargetTransformInfo.cpp    | 10 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  9 +-
 .../Transforms/Scalar/LoopLoadElimination.cpp | 16 ++--
 .../Vectorize/LoopVectorizationLegality.cpp   |  5 +-
 .../inbounds-gep-in-predicated-blocks.ll      | 17 +++-
 .../RISCV/masked_gather_scatter.ll            | 40 +++++---
 .../x86-interleaved-accesses-masked-group.ll  | 91 +++++++++++++++++--
 10 files changed, 188 insertions(+), 57 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 84b4ad7c1d5a9..c85ef3e131068 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 /// result of this function is undefined.
 LLVM_ABI std::optional<int64_t>
 getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
-             const Loop *Lp,
+             const Loop *Lp, const DominatorTree &DT,
              const DenseMap<Value *, const SCEV *> &StridesMap =
                  DenseMap<Value *, const SCEV *>(),
             bool Assume = false, bool ShouldCheckWrap = true);
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e27a9b1c44014..5d88e5f54e3d6 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -806,11 +806,11 @@ class AccessAnalysis {
   typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
 
   AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
-                 MemoryDepChecker::DepCandidates &DA,
+                 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
                  PredicatedScalarEvolution &PSE,
                  SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
-      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
-        LoopAliasScopes(LoopAliasScopes) {
+      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
+        PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
     // We're analyzing dependences across loop iterations.
     BAA.enableCrossIterationMode();
   }
@@ -934,6 +934,9 @@ class AccessAnalysis {
   /// The LoopInfo of the loop being checked.
   const LoopInfo *LI;
 
+  /// The dominator tree of the function.
+  DominatorTree &DT;
+
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
   /// dependence check.
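To make the motivating case concrete, here is a hedged sketch of a loop shape
affected by this change (hypothetical function and names, not taken from the
test suite). The GEP feeding the store sits in a predicated block, so the fact
that an inbounds GEP produces poison on wrap proves nothing about iterations
in which the store never executes:

// Minimal sketch: because the store (and thus the inbounds GEP) only runs
// when Cond[I] is true, wrapping arithmetic in skipped iterations would
// never trigger poison-dependent UB, so inbounds alone cannot establish
// no-wrap across the whole iteration space.
void conditionalStore(int *A, const bool *Cond, int N) {
  for (int I = 0; I < N; ++I)
    if (Cond[I])   // %gep = getelementptr inbounds i32, ptr %A, i64 %I
      A[I] = 0;    // is only dereferenced inside this predicated block
}

The hunk below implements exactly this guard: the GEP's flags are trusted only
when the loop body is a single block or some load/store user of the GEP is in
a block that needs no predication.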
@@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. - int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 10f2c80edc1b3..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -6207,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6216,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6262,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. 
if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. 
if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..03112c67dda7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 89819f2be4967..1cbec47d72203 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.scevcheck: +; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624) +; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]] +; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]] +; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624) +; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 +; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]] +; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]] +; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 -; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752 +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; RV32-NEXT: br i1 [[CONFLICT_RDX]], label 
[[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16) @@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double> ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: br label [[FOR_END:%.*]] ; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, 
ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 @@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; @@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 
false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 From 37825ad4f6c5d7477fa1f8ed8b00aec186a8ceb9 Mon Sep 17 00:00:00 2001 From: Ivan Kelarev <ivan.kelarev@intel.com> Date: Tue, 4 Nov 2025 09:20:01 -0800 Subject: [PATCH 192/313] [LoopUnroll] Prevent LoopFullUnrollPass from performing partial unrolling when trip counts are unknown (#165013) Currently, `LoopFullUnrollPass` incorrectly performs partial unrolling when `#pragma unroll` is specified and both `TripCount` and `MaxTripCount` are unknown. This patch adds a check to prevent partial unrolling when the `OnlyFullUnroll` parameter is true and both trip count values are zero. --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 3 +- .../LoopUnroll/full-unroll-avoid-partial.ll | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; } diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5!
+ +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} From 67ce4aba26172cda8fed90077e3393e98c22d2d2 Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 09:24:27 -0800 Subject: [PATCH 193/313] [llvm][mustache] Use single pass when tokenizing (#159196) The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins. | Metric | Baseline | Single-Pass | Change | | --- | --- | --- | --- | | Time (ms) | 36\.09 | 35\.78 | \-0.86% | | Cycles | 35\.3M | 35\.0M | \-0.79% | | Instructions | 86\.7M | 85\.8M | \-1.03% | | Branch Misses | 116K | 114K | \-1.91% | | Cache Misses | 244K | 232K | \-4.98% | --- llvm/lib/Support/Mustache.cpp | 184 +++++++++++++--------------------- 1 file changed, 71 insertions(+), 113 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..24e3105c5e8a9 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -368,141 +368,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. 
- - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. 
+ Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. 
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); From 0ce03c2be4c43d19e4b63d805b13838f56621f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Tue, 4 Nov 2025 07:33:09 -1000 Subject: [PATCH 194/313] [flang][cuda] Add interface and lowering for atomicadd_r4x2 and atomicadd_r4x4 (#166308) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 71 +++++++++++-------- flang/module/cudadevice.f90 | 20 +++++- flang/test/Lower/CUDA/cuda-atomicadd.cuf | 24 +++++-- 4 files changed, 83 insertions(+), 33 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index bbdef481a2085..b64419f5ae6da 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,7 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> fir::ExtendedValue genAtomicAddVector(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index b9ea8b125b780..3156c8cb4332c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,12 +290,12 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicadd_r2x2", - &I::genAtomicAddVector, + {"atomicadd_r4x2", + &I::genAtomicAddVector<2>, {{{"a", asAddr}, {"v", asAddr}}}, false}, - {"atomicadd_r4x2", - &I::genAtomicAddVector, + {"atomicadd_r4x4", + &I::genAtomicAddVector<4>, {{{"a", asAddr}, {"v", asAddr}}}, false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -306,6 +306,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAtomicAddR2, {{{"a", asAddr}, {"v", asAddr}}}, false}, + {"atomicaddvector_r2x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomiccasd", &I::genAtomicCas, @@ -3176,44 +3184,51 @@ IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef<int64_t>{0}); } +template <int extent> fir::ExtendedValue IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); mlir::Value res = fir::AllocaOp::create( - builder, loc, fir::SequenceType::get({2}, resultType)); + builder, loc, fir::SequenceType::get({extent}, resultType)); mlir::Value a = fir::getBase(args[0]); if (mlir::isa<fir::BaseBoxType>(a.getType())) { a = fir::BoxAddrOp::create(builder, loc, a); } - auto vecTy = mlir::VectorType::get({2}, resultType); + auto vecTy = mlir::VectorType::get({extent}, resultType); auto refTy = fir::ReferenceType::get(resultType); mlir::Type i32Ty = builder.getI32Type(); mlir::Type idxTy = builder.getIndexType(); - mlir::Value zero = 
builder.createIntegerConstant(loc, idxTy, 0); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), zero); - mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), one); - mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); - mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); - mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( - builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. mlir::Value add = - genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - mlir::Value r1 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value r2 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); - mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); - mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); - fir::StoreOp::create(builder, loc, r1, c1); - fir::StoreOp::create(builder, loc, r2, c2); - mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); return fir::ArrayBoxValue(res, {ext}); } diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index b1aef95cba8c9..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1179,13 +1179,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val) end interface interface atomicaddvector - attributes(device) pure function atomicadd_r2x2(address, val) result(z) + attributes(device) pure function atomicaddvector_r2x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(2), dimension(2), intent(inout) :: address real(2), dimension(2), intent(in) :: val real(2), dimension(2) :: z end function + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 attributes(device) pure function atomicadd_r4x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(4), dimension(2), intent(inout) :: address @@ -1194,6 +1203,15 @@ attributes(device) pure function atomicadd_r4x2(address, val) result(z) end function end interface + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function + end interface + interface atomicsub attributes(device) pure integer function atomicsubi(address, val) !dir$ ignore_tkr (d) address, (d) val diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf index baa6cdb3d5869..6669b4afa291d 100644 --- a/flang/test/Lower/CUDA/cuda-atomicadd.cuf +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -2,18 +2,34 @@ ! Test CUDA Fortran atmoicadd functions available cudadevice module -attributes(global) subroutine atomicaddvector_r2() +attributes(global) subroutine test_atomicaddvector_r2() real(2), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> -attributes(global) subroutine atomicaddvector_r4() +attributes(global) subroutine test_atomicaddvector_r4() real(4), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> From 6601c384d36e050ae007c691b2d0b9c479edbdf6 Mon Sep 17 00:00:00 2001 From: Greg Clayton <gclayton@fb.com> Date: Tue, 4 Nov 2025 09:36:54 -0800 Subject: [PATCH 195/313] Fix getting section info in large mach-o files. (#165940) Mach-O has 32-bit file offsets in the MachO::section_64 structs. dSYM files can contain sections whose start offset exceeds UINT32_MAX, which means the MachO::section_64.offset will get truncated. We can calculate when this happens and properly adjust the section offset to be 64-bit safe. This means tools can get the correct section contents for large dSYM files and allows tools that parse DWARF, like llvm-gsymutil, to load and convert these files correctly. --- llvm/include/llvm/Object/MachO.h | 2 +- llvm/lib/Object/MachOObjectFile.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned.
+ uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; From ecd67a7a95b3ec1a8798ff8a50d6668ec0106a28 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 18:40:42 +0100 Subject: [PATCH 196/313] [CIR] Upstream CXXDefaultArgExpr for AggregateExpr (#165991) Upstream the CXXDefaultArgExpr support for AggregateExpr --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 4 +- clang/test/CIR/CodeGen/struct.cpp | 44 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 3d3030ca87e2a..201fb73983155 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); } void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getSourceRange(), - "AggExprEmitter: VisitCXXDefaultArgExpr"); + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + Visit(dae->getExpr()); } void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index c8db71498e477..ee543001025e7 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -344,3 +344,47 @@ void struct_with_const_member_expr() { // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 // OGCG: store i32 0, ptr %[[A_ADDR]], align 4 + +void function_arg_with_default_value(CompleteS a = {1, 2}) {} + +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["a", init] +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr<!rec_CompleteS> + +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +void calling_function_with_default_values() { + function_arg_with_default_value(); +} + +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["agg.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr<!rec_CompleteS> -> !cir.ptr<!s32i> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr<!rec_CompleteS> -> !cir.ptr<!s8i> +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_2_I8:.*]] 
= cir.cast integral %[[CONST_2]] : !s32i -> !s8i
+// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr<!s8i>
+// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr<!rec_CompleteS>, !rec_CompleteS
+// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> ()
+
+// TODO(CIR): the difference between the CIR LLVM and OGCG output is due to the lack of calling convention lowering.
+
+// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
+// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
+// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
+// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
+// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4
+// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4
+// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]])
+
+// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4
+// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
+// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4
+// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
+// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4
+// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4
+// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 %[[TMP_AGG]])

From a02e5740119a4d13542126b124f2c464b23738d4 Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com>
Date: Tue, 4 Nov 2025 18:41:01 +0100
Subject: [PATCH 197/313] [libc] Add faccessat entrypoints for aarch64 and riscv (#165869)

Add faccessat entrypoints for aarch64 and riscv Linux. Entrypoints are
removed if the faccessat2 syscall is not available.

---
 libc/config/linux/aarch64/entrypoints.txt | 3 +--
 libc/config/linux/aarch64/exclude.txt     | 8 ++++++++
 libc/config/linux/riscv/entrypoints.txt   | 1 +
 libc/config/linux/riscv/exclude.txt       | 8 ++++++++
 libc/config/linux/x86_64/exclude.txt      | 1 +
 5 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 libc/config/linux/aarch64/exclude.txt
 create mode 100644 libc/config/linux/riscv/exclude.txt

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 714120a79e39a..e0dd15b803253 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -325,8 +325,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.unistd.dup2
     libc.src.unistd.dup3
     libc.src.unistd.execve
-    # Disabled while SYS_faccessat2 is unavailable on the buildbot.
- # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..0d031d8844f13 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) From 4ce58833d3653f0b15d5458b8430ec8cf25fdc16 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Wed, 5 Nov 2025 01:43:05 +0800 Subject: [PATCH 198/313] [SimplifyCFG] Fix value enumeration of a full range (#166379) ConstantRange uses `[-1, -1)` as the canonical form of a full set. Therefore, the `for (APInt I = Lower; I != Upper; ++I)` idiom doesn't work for full ranges. This patch fixes the value enumeration in `ConstantComparesGatherer` to prevent missing values for full sets. Closes https://github.com/llvm/llvm-project/issues/166369. 
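To make the failure mode concrete, here is a standalone sketch (using
`uint8_t` in place of `APInt`, so nothing here is code from this patch):
with a half-open wrapped range, `Lower == Upper` denotes the full set, so
the old `for` idiom exits before its first iteration, while the do/while
form visits every value exactly once, assuming the range is known to be
non-empty as it is at this call site.

```
#include <cstdint>
#include <vector>

// Enumerate a non-empty wrapped range [Lower, Upper) over uint8_t.
std::vector<uint8_t> enumerateNonEmptyRange(uint8_t Lower, uint8_t Upper) {
  std::vector<uint8_t> Vals;
  // Buggy idiom: for (uint8_t I = Lower; I != Upper; ++I) runs zero times
  // when Lower == Upper, the canonical encoding of the full set.
  uint8_t I = Lower;
  do
    Vals.push_back(I);  // the body runs at least once
  while (++I != Upper); // wraps modulo 256 and stops after all 256 values
  return Vals;
}
```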
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp    |  4 ++-
 llvm/test/Transforms/SimplifyCFG/pr166369.ll | 37 ++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/pr166369.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 532511dcf91b0..3a3e3ade20212 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -778,8 +778,10 @@ struct ConstantComparesGatherer {
       return false;

     // Add all values from the range to the set
-    for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+    APInt Tmp = Span.getLower();
+    do
       Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+    while (++Tmp != Span.getUpper());
     UsedICmps++;
     return true;
diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll
new file mode 100644
index 0000000000000..c0a85c0293dd8
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s
+
+; Make sure we handle full-set ranges correctly.
+define void @test_i1() {
+; CHECK-LABEL: define void @test_i1() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %icmp = icmp ugt i1 false, true
+  br label %bb5
+
+bb5:
+  %select = select i1 %icmp, i1 %icmp, i1 false
+  br i1 %select, label %bb5, label %bb6
+
+bb6:
+  ret void
+}
+
+define void @test_i3() {
+; CHECK-LABEL: define void @test_i3() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %icmp = icmp ugt i3 0, 7
+  br label %bb5
+
+bb5:
+  %select = select i1 %icmp, i1 %icmp, i1 false
+  br i1 %select, label %bb5, label %bb6
+
+bb6:
+  ret void
+}

From 39221718519f2ea3710cc3f5940adb13639b4f80 Mon Sep 17 00:00:00 2001
From: alessandra simmons <alessandra@adrs.pub>
Date: Tue, 4 Nov 2025 12:45:55 -0500
Subject: [PATCH 199/313] [clang][Driver][HIP] Change OffloadingActionBuilder to respect the --no-gpu-bundle-output flag (#163834)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the command `clang -c -emit-llvm --no-gpu-bundle-output
--offload-arch=gfx900,gfx1030 -O3 -x hip square.hip` will lead to a
bundled output:

```
❯ ../bin/clang -c -emit-llvm --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip square.hip
❯ ls
square.hip square.bc
```

This doesn't match my expectation of the behavior of
`--no-gpu-bundle-output`, so this adds a check into
OffloadingActionBuilder for the flag when replacing the host compile
action with a bundling action.

---
 clang/lib/Driver/Driver.cpp                   | 16 +++++++++----
 clang/test/Driver/no-gpu-bundle-respected.hip | 24 +++++++++++++++++++
 2 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/Driver/no-gpu-bundle-respected.hip

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 6f6a35b4c8c17..a0b82cec9a372 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3857,6 +3857,9 @@ class OffloadingActionBuilder final {
   /// Flag set to true if all valid builders allow file bundling/unbundling.
   bool CanUseBundler;

+  /// Flag set to false if an argument turns off bundling.
+ bool ShouldUseBundler; + public: OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) @@ -3891,6 +3894,9 @@ class OffloadingActionBuilder final { } CanUseBundler = ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling; + + ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output, true); } ~OffloadingActionBuilder() { @@ -4042,11 +4048,11 @@ class OffloadingActionBuilder final { SB->appendTopLevelActions(OffloadAL); } - // If we can use the bundler, replace the host action by the bundling one in - // the resulting list. Otherwise, just append the device actions. For - // device only compilation, HostAction is a null pointer, therefore only do - // this when HostAction is not a null pointer. - if (CanUseBundler && HostAction && + // If we can and should use the bundler, replace the host action by the + // bundling one in the resulting list. Otherwise, just append the device + // actions. For device only compilation, HostAction is a null pointer, + // therefore only do this when HostAction is not a null pointer. + if (CanUseBundler && ShouldUseBundler && HostAction && HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) { // Add the host action to the list in order to create the bundling action. OffloadAL.push_back(HostAction); diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip new file mode 100644 index 0000000000000..fc93640dc4b90 --- /dev/null +++ b/clang/test/Driver/no-gpu-bundle-respected.hip @@ -0,0 +1,24 @@ +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE + +// BUNDLE: clang-offload-bundler +// NOBUNDLE-NOT: clang-offload-bundler + +// COM: sanity checks +// COMPILER: compiler +// GFX1030: (device-hip, gfx1030) +// GFX900: (device-hip, gfx900) +// OFFLOAD: offload + +int square(int num) { + return num * num; +} From 1aa86ca521b8fea5ff728945d5ea5cdef97a6250 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" <jdenny.ornl@gmail.com> Date: Tue, 4 Nov 2025 12:49:33 -0500 Subject: [PATCH 200/313] [LoopUnroll] Fix division by zero (#166258) PR #159163's probability computation for epilogue loops does not handle the possibility of an original loop probability of one. Runtime loop unrolling does not make sense for such an infinite loop, and a division by zero results. This patch works around that case. Issue #165998. 
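For intuition, here is a toy model of the hazard, using plain doubles and
an assumed geometric trip-count distribution. The exact expression in
probOfNextInRemainder below differs; only the vanishing denominator
carries over, so treat this strictly as an illustration.

```
#include <cmath>

// Toy sketch: with latch probability P, conditional probabilities over
// iteration-count ranges divide by terms like 1 - P^N, which is exactly
// zero when P == 1. That is why the patch adds an early return.
double probOfNextInRemainderSketch(double P, unsigned N) {
  if (P == 1.0)  // always-infinite original loop
    return 1.0;  // arbitrary choice: every remainder iteration executes
  double ProbAtLeastOne = P;             // P(m >= 1)
  double ProbAtLeastN = std::pow(P, N);  // P(m >= N)
  // Without the guard above, P == 1 would make this denominator zero.
  return (ProbAtLeastOne - ProbAtLeastN) / (1.0 - ProbAtLeastN);
}
```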
--- .../Transforms/Utils/LoopUnrollRuntime.cpp | 21 ++++ .../LoopUnroll/loop-probability-one.ll | 116 ++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/loop-probability-one.ll diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc76900c..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. 
+ +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. +; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} From 1409db663139a644871362ffb23d725078bc84cf Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena <usx@google.com> Date: Tue, 4 Nov 2025 12:53:58 -0500 Subject: [PATCH 201/313] [LifetimeSafety] Fix Python path for Windows compatibility (#166291) Fix Python virtual environment paths for Windows in the Lifetime Safety Analysis benchmark ### What changed? - Added conditional path setting for the Python executable in the virtual environment based on the platform - For Windows, use `Scripts/python` path - For other platforms, use `bin/python` path - Updated the commands that use the Python virtual environment to use the platform-specific path ### How to test? `ninja benchmark_lifetime_safety_analysis` Fixes #166143 --- clang/test/Analysis/LifetimeSafety/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt index ce37a29655668..2f9c2ac247497 100644 --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS set(LIFETIME_BENCHMARK_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results") +if(WIN32) + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python") +else() + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python") +endif() if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS}) @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME add_custom_command( OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR} - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS} COMMENT "Creating Python virtual environment and installing dependencies for benchmark..." ) @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME # Main benchmark target add_custom_target(benchmark_lifetime_safety_analysis - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT} --clang-binary ${LLVM_BINARY_DIR}/bin/clang --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR} From 8aff0d99d6081355eeba2ed4e0a5484db0f4170b Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht <rupprecht@google.com> Date: Tue, 4 Nov 2025 11:54:43 -0600 Subject: [PATCH 202/313] [test] Avoid writing test output to readonly dir (#166404) Omitting `-o /dev/null` may try to write output to the current dir, which may not have write permissions on some build systems. 
This fixes the test added by #165737 --- llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll index d3853e2fdaa88..4d81fdc67736d 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s ; Test that we get a clear error message when using an unsupported syncscope. From 208b7360152bdc92a9089efaadd2167549b73908 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 17:55:06 +0000 Subject: [PATCH 203/313] [X86] select-copy.mir - regenerate test checks. NFC (#166405) --- .../CodeGen/X86/GlobalISel/select-copy.mir | 136 +++++++++--------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... 
--- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... --- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... --- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) From 8208591f274c831e0f72003eb466ee4076c7dbe7 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Tue, 4 Nov 2025 09:56:59 -0800 Subject: [PATCH 204/313] [RISCV] Use TargetConstant for the immediate RISCVISD::SHL_ADD nodes. (#166312) This is consistent with some other nodes that require a constant. Particularly intrinsics with ImmArg. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 ++++++++++--------- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 6 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 6 ++-- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c56ce3fd2a5a4..b8605629e2dfe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? 
N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16501,9 +16501,9 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); + DAG.getTargetConstant(ShY, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); + DAG.getTargetConstant(ShX, DL, VT), Mul359); } static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, @@ -16571,12 +16571,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } @@ -16596,7 +16596,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); } } @@ -16609,10 +16609,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, assert(Shift != 0 && "MulAmt=4,6,10 handled before"); if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); + DAG.getTargetConstant(Shift, DL, VT), X); } } @@ -16624,9 +16625,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16641,7 +16643,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index b37ceaaee9cf4..c2b25c6294019 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -60,6 +60,8 @@ def immfour : RISCVOp {
   let DecoderMethod = "decodeImmFourOperand";
 }

+def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction class templates
 //===----------------------------------------------------------------------===//
@@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction,
 let Predicates = [HasVendorXTHeadBa] in {
 def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)),
           (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
-def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)),
-          (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
+def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>;

 // Reuse complex patterns from StdExtZba
 def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 4537bfe8025ca..8376da52be53e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
   let OperandType = "OPERAND_UIMM5_GT3";
 }

+def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>;
+
 def UImm5Plus1AsmOperand : AsmOperandClass {
   let Name = "UImm5Plus1";
   let RenderMethod = "addImmOperands";
@@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))),
           (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>;
 def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)),
           (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
-def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
+def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)),
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>;
 } // Predicates = [HasVendorXqciac, IsRV32]

 /// Simple arithmetic operations

From 6217f351bb8862a224e8b4434fa6376d803c292f Mon Sep 17 00:00:00 2001
From: actink <actink@163.com>
Date: Wed, 5 Nov 2025 02:03:02 +0800
Subject: [PATCH 205/313] [NFC][AMDGPU] use DAG.UpdateNodeOperands to update the chain (#166396)

---
 llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c920a046..31eca049fd149 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
     // Add an extra level of chain to isolate this vector
     SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
-    // TODO: can the chain be replaced without creating a new store?
-    SDValue NewStore = DAG.getTruncStore(
-        NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
-        StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
-        StoreNode->getAAInfo());
-    StoreNode = cast<StoreSDNode>(NewStore);
+    SmallVector<SDValue, 4> NewOps(StoreNode->ops());
+    NewOps[0] = NewChain;
+    StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps));
   }

   return scalarizeVectorStore(StoreNode, DAG);

From fb21f16fe6fd1a1fa03662510bde042309ac8ae1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 10:06:29 -0800
Subject: [PATCH 206/313] RuntimeLibcalls: Add stub API for getting function signatures (#166290)

Eventually this should be generated by tablegen for all functions. For
now add a manual implementation for sincos_stret, which I have an
immediate use for.

This will allow pulling repeated code across targets into shared call
sequence code.

Also add sqrt just to make sure we can handle adding return attributes
on the declaration.

---
 llvm/include/llvm/IR/RuntimeLibcalls.h        |  7 ++
 llvm/lib/IR/RuntimeLibcalls.cpp               | 79 +++++++++++++++++++
 .../Utils/DeclareRuntimeLibcalls.cpp          | 48 ++++++++++-
 .../Util/DeclareRuntimeLibcalls/basic.ll      |  4 +
 .../merge_attributes.ll                       | 11 +++
 .../DeclareRuntimeLibcalls/sincos_stret.ll    | 23 ++++++
 .../wrong_declaration.ll                      | 21 +++++
 7 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 01359894b0421..ab14ed44fed52 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -186,6 +186,13 @@ struct RuntimeLibcallsInfo {
     return RTLIB::Unsupported;
   }

+  /// \returns the function type and attributes for the \p LibcallImpl,
+  /// depending on the target \p TT. If the function has incomplete type
+  /// information, return nullptr for the function type.
+ std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..2ce5719228a0d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? 
Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..6d4436b92c119 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..c005316f07f06 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -11,5 +11,9 @@ define float @sinf(float %x) { } ; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + ; CHECK: declare void @__umodti3(...) 
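For context on what the next test exercises: the merge keeps whatever the
user's declaration already carries and layers the canonical libcall
attributes on top. A minimal sketch of the function-attribute half follows
(it mirrors the AttrBuilder::merge and addFnAttributes calls that
mergeAttributes() in this patch uses; the helper name is illustrative):

```
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Combine a function's existing fn attributes with a canonical set.
void mergeFnAttrsSketch(LLVMContext &Ctx, Function *Func,
                        AttributeList Canonical) {
  AttrBuilder OldB(Ctx, Func->getAttributes().getFnAttrs()); // e.g. {"foo"}
  AttrBuilder NewB(Ctx, Canonical.getFnAttrs());
  OldB.merge(NewB); // union; NewB overwrites conflicting valued attributes
  Func->setAttributes(Func->getAttributes().addFnAttributes(Ctx, OldB));
}
```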
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..0d0e3da25eea7 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s + +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s + +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or declarations +; with the wrong type signature. 
+ +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + From 8ee1803538f90e63ee81cc6f51ff294925f83192 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 19:12:41 +0100 Subject: [PATCH 207/313] [CIR] Upstream Builtin ExpOp (#166061) Upstream the Builtin ExpOp --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 10 +++++++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 11 ++++++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 8 +++++++ .../CIR/CodeGen/builtins-floating-point.c | 21 +++++++++++++++++++ 4 files changed, 50 insertions(+) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index dc56db1bbd4ea..6f9a69e697cc3 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4191,6 +4191,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { }]; } +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> { + let summary = "Computes the floating-point base-e exponential value"; + let description = [{ + `cir.exp` computes the exponential of a floating-point operand and returns + a result of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. + }]; +} + def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> { let summary = "Computes the floating-point absolute value"; let description = [{ diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index d9b9e3b877b50..0803910f2e83a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -222,6 +222,17 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::fastMathFlags()); return emitUnaryMaybeConstrainedFPBuiltin<cir::CeilOp>(*this, *e); + case Builtin::BIexp: + case Builtin::BIexpf: + case Builtin::BIexpl: + case Builtin::BI__builtin_exp: + case Builtin::BI__builtin_expf: + case Builtin::BI__builtin_expf16: + case Builtin::BI__builtin_expl: + case Builtin::BI__builtin_expf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin<cir::ExpOp>(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d94108294a9a3..ba967a43ce59a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite( + cir::ExpOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp<mlir::LLVM::ExpOp>(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, mlir::Value llvmSrc, mlir::Type llvmDstIntTy, bool isUnsigned, uint64_t cirSrcWidth, diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c 
b/clang/test/CIR/CodeGen/builtins-floating-point.c index 8bdc43c59dc6f..1b7de650662c7 100644 --- a/clang/test/CIR/CodeGen/builtins-floating-point.c +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c @@ -25,3 +25,24 @@ float ceil(float f) { // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) } + +float expf(float f) { + return __builtin_expf(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float + // LLVM: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) +} + +double exp(double f) { + return __builtin_exp(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double + // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) + // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) +} + +long double expl(long double f) { + return __builtin_expl(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double<!cir.f128> + // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) + // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) +} From fe106b6e73086f54ca880fee393fcafc45e8209c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 10:19:32 -0800 Subject: [PATCH 208/313] BasicTTI: Cleanup multiple result intrinsic handling (#165970) Avoid weird lambda returning function pointer and sink the libcall logic to where the operation is handled. This allows chaining the libcall logic to try sincos_stret and fallback to sincos. The resulting cost seems too low. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 49 +++++++++++-------- .../test/Analysis/CostModel/AArch64/sincos.ll | 21 ++++++-- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..221d8f1e2f673 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,12 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + EVT ScalarVT = VT.getScalarType(); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(ScalarVT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(ScalarVT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(ScalarVT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; + StringRef LCName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); + // Search for a corresponding vector variant. + // + // FIXME: Should use RuntimeLibcallsInfo, not TargetLibraryInfo to get the + // vector mapping. 
LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr; @@ -2137,22 +2162,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2169,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..72c8f2bbbf8cf 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x 
float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x 
fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) From 831e79adff4506a0b22a770dcaa46bf5a37257cb Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 10:20:00 -0800 Subject: [PATCH 209/313] DAG: Merge all sincos_stret emission code into legalizer (#166295) This avoids AArch64 legality rules depending on libcall availability. ARM, AArch64, and X86 all had custom lowering of fsincos which all were just to emit calls to sincos_stret / sincosf_stret. This messes with the cost heuristics around legality, because really it's an expand/libcall cost and not a favorable custom. This is a bit ugly, because we're emitting code trying to match the C ABI lowered IR type for the aggregate return type. This now also gives an easy way to lift the unhandled x86_32 darwin case, since ARM already handled the return as sret case. --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 109 ++++++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 43 +------ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 78 +------------ llvm/lib/Target/ARM/ARMISelLowering.h | 1 - llvm/lib/Target/X86/X86ISelLowering.cpp | 60 +--------- 6 files changed, 118 insertions(+), 174 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..316aacdf6978e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. 
+ SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. 
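+  // (Editorial note: the sret buffer is laid out as { sin, cos }, so the
+  // cos value sits StoreSize bytes past the base of the stack slot.)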
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..d08f9b94227a2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..92fae71121a81 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6edf0185df813..da2556978b39d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33004,61 +33004,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 2); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) - .setIsPostTypeLegalization(); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. 
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33663,7 +33608,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); From 2e8543c73555e26bba1c2dffab18288f33916b5d Mon Sep 17 00:00:00 2001 From: Sp00ph <61327188+Sp00ph@users.noreply.github.com> Date: Tue, 4 Nov 2025 19:32:55 +0100 Subject: [PATCH 210/313] [X86] Improve variable 8-bit shifts on AVX512BW (#164136) Previously, `clang -march=znver5 -O3` would emit the following for `shl`, `lshr` and `ashr <64 x i8>`: ```asm .LCPI0_2: .byte 8 .byte 4 .byte 2 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .LCPI0_3: .byte 32 .byte 16 .byte 8 .byte 4 .byte 2 .byte 1 .byte 0 .byte 0 shl: vpsllw zmm1, zmm1, 5 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_2]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_3]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm0 {k1}, zmm0, zmm0 ret .LCPI1_3: .byte 0 .byte 0 .byte 0 .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .LCPI1_4: .byte 0 .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .byte 8 .byte 4 .LCPI1_5: .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .byte 8 .byte 4 .byte 2 lshr: vpsllw zmm1, zmm1, 5 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_3]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_4]{1to8}, 0 vpmovb2m k1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_5]{1to8}, 0 ret ashr: vpsllw zmm1, zmm1, 5 vpunpckhbw zmm2, zmm0, zmm0 vpunpckhbw zmm4, zmm1, zmm1 vpsraw zmm3, zmm2, 4 vpunpcklbw zmm0, zmm0, zmm0 vpmovb2m k1, zmm4 vpaddw zmm4, zmm4, zmm4 vpunpcklbw zmm1, zmm1, zmm1 vmovdqu8 zmm2 {k1}, zmm3 vpmovb2m k1, zmm4 vpsraw zmm3, zmm2, 2 vpaddw zmm4, zmm4, zmm4 vmovdqu8 zmm2 {k1}, zmm3 vpsraw zmm3, zmm2, 1 vpmovb2m k1, zmm4 vmovdqu8 zmm2 {k1}, zmm3 vpmovb2m k1, zmm1 vpsraw zmm3, zmm0, 4 vpaddw zmm1, zmm1, zmm1 vpsrlw zmm2, zmm2, 8 vmovdqu8 zmm0 {k1}, zmm3 vpmovb2m k1, zmm1 vpsraw zmm3, zmm0, 2 vpaddw zmm1, zmm1, zmm1 vmovdqu8 zmm0 {k1}, zmm3 vpsraw zmm3, zmm0, 1 vpmovb2m k1, zmm1 vmovdqu8 zmm0 {k1}, zmm3 vpsrlw zmm0, zmm0, 8 vpackuswb zmm0, zmm0, zmm2 ret ``` With this commit, the generated assembly becomes this: ```asm .LCPI0_2: .byte 0 .byte 255 .byte 0 .byte 255 .LCPI0_3: .byte 255 .byte 0 .byte 255 .byte 0 shl: vpsrlw zmm2, zmm1, 8 vpandd zmm3, zmm0, dword ptr [rip + .LCPI0_2]{1to16} vpandd zmm1, zmm1, dword ptr [rip + .LCPI0_3]{1to16} movabs rax, -6148914691236517206 kmovq k1, rax vpsllvw zmm2, zmm3, zmm2 vpsllvw zmm0, zmm0, zmm1 vmovdqu8 zmm0 {k1}, zmm2 ret .LCPI1_0: .byte 255 .byte 0 lshr: vpbroadcastw zmm2, word ptr [rip + .LCPI1_0] movabs rax, -6148914691236517206 kmovq k1, rax vpandq zmm3, zmm1, zmm2 vpandq zmm2, zmm0, zmm2 vpsrlw zmm1, zmm1, 8 vpsrlvw zmm2, zmm2, zmm3 vpsrlvw zmm0, zmm0, zmm1 vmovdqu8 zmm2 {k1}, zmm0 vmovdqa64 zmm0, zmm2 ret .LCPI2_1: .byte 255 .byte 0 .byte 255 .byte 0 ashr: vpsrlw zmm2, zmm1, 8 vpandd zmm1, zmm1, dword ptr [rip + .LCPI2_1]{1to16} movabs rax, -6148914691236517206 vpsravw zmm2, zmm0, zmm2 vpsllw zmm0, zmm0, 8 kmovq k1, rax vpsraw zmm0, 
zmm0, 8 vpsravw zmm0, zmm0, zmm1 vmovdqu8 zmm0 {k1}, zmm2 ret ``` While I don't have AVX512 hardware, llvm-mca suggests significant speedups, and I've done some simple correctness tests on random inputs using the Intel Software Development Emulator. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 57 ++++++++++++++ llvm/test/CodeGen/X86/gfni-shifts.ll | 75 +++++++------------ .../test/CodeGen/X86/vector-shift-ashr-512.ll | 40 +++------- .../test/CodeGen/X86/vector-shift-lshr-512.ll | 22 ++---- llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 19 ++--- 5 files changed, 109 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index da2556978b39d..06b8f7614bffd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. 
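+  //
+  // (Editorial example: viewing R as bytes b0,b1,b2,..., each 16-bit lane
+  // holds the pair [b1:b0], [b3:b2], ...; ShiftedLo is correct in the even
+  // byte lanes and ShiftedHi in the odd ones. Bit i of 0x5555... is set
+  // exactly for even i, so the select takes byte i from ShiftedLo when i
+  // is even and from ShiftedHi otherwise.)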
+ SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 0fb0420bb2609..aff2228c258b5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d5702fb93a..4450d07e01cca 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm2, 
%zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift

From 290ff955f07f44b5a9e0a03d405c60f794eb2e98 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 4 Nov 2025 18:34:13 +0000
Subject: [PATCH 211/313] [VPlan] Verify incoming values of VPIRPhi match before checking (NFC)

Update the verifier to first check if the number of incoming values
matches the number of predecessors, before using
incoming_values_and_blocks. We unfortunately also need to check here, as
this may be called before verifyPhiRecipes runs. Also update the
verifier unit tests to actually fail for the expected recipes.
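A minimal sketch of the invariant being enforced (editorial; it reuses the
accessors that appear in the diff below rather than defining new API, and
`countsMatch` is a hypothetical helper name):

```cpp
// A phi-like VPIRPhi must receive exactly one incoming value per
// predecessor of its parent block; this must hold before any code zips
// operands with predecessors via incoming_values_and_blocks().
static bool countsMatch(const VPRecipeBase &R) {
  return R.getNumOperands() == R.getParent()->getNumPredecessors();
}
```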
--- .../Transforms/Vectorize/VPlanVerifier.cpp | 7 +++ .../Vectorize/VPlanVerifierTest.cpp | 51 ++++++++++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..34754a1ea3992 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 46802826fe090..169114ed6c310 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -326,22 +326,18 @@ TEST_F(VPVerifierTest, NonHeaderPHIInHeader) { class VPIRVerifierTest : public VPlanTestIRBase {}; -TEST_F(VPIRVerifierTest, testVerifyIRPhi) { +TEST_F(VPIRVerifierTest, testVerifyIRPhiInScalarHeaderVPIRBB) { const char *ModuleString = "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %loop\n" "loop:\n" " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" - " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" - " %l1 = load i32, ptr %arr.idx, align 4\n" - " %res = add i32 %l1, 10\n" - " store i32 %res, ptr %arr.idx, align 4\n" " %iv.next = add i64 %iv, 1\n" " %exitcond = icmp ne i64 %iv.next, %N\n" " br i1 %exitcond, label %loop, label %for.end\n" "for.end:\n" - " %p = phi i32 [ %l1, %loop ]\n" + " %p = phi i64 [ %iv, %loop ]\n" " ret void\n" "}\n"; @@ -351,7 +347,48 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); - Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0)); +#if GTEST_HAS_STREAM_REDIRECTION + ::testing::internal::CaptureStderr(); +#endif + EXPECT_FALSE(verifyVPlanIsValid(*Plan)); +#if GTEST_HAS_STREAM_REDIRECTION + EXPECT_STREQ( + "Phi-like recipe with different number of operands and predecessors.\n", + ::testing::internal::GetCapturedStderr().c_str()); +#endif +} + +TEST_F(VPIRVerifierTest, testVerifyIRPhiInExitVPIRBB) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop\n" + "loop:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop, label %for.end\n" + "for.end:\n" + " %p = phi i64 [ %iv, %loop ]\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildVPlan(LoopHeader); + + // Create a definition in the vector loop header that will be used by the phi. 
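+  // (Editorial note: together with the extra constant operand added to the
+  // scalar-header phi below, this leaves operand counts that no longer
+  // match predecessor counts, so the new mismatch check should fire.)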
+ auto *HeaderBlock = + cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getEntry()); + VPInstruction *DefI = + new VPInstruction(VPInstruction::ExtractLastElement, + {HeaderBlock->front().getVPSingleValue()}); + DefI->insertBefore(Plan->getMiddleBlock()->getTerminator()); + Plan->getExitBlocks()[0]->front().addOperand(DefI); + VPValue *Zero = Plan->getConstantInt(32, 0); + Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); From e29ee270615f3c8e3c9a0a657c0be43f9d76f441 Mon Sep 17 00:00:00 2001 From: Sam Clegg <sbc@chromium.org> Date: Tue, 4 Nov 2025 10:40:46 -0800 Subject: [PATCH 212/313] [lld][WebAssembly] Allow `--no-stack-first` in addition to `--stack-first` (#166384) This paves the way to make `--stack-first` the default. See: #151015 --- lld/test/wasm/stack-first.test | 25 +++++++++++++++++++++++++ lld/wasm/Driver.cpp | 2 +- lld/wasm/Options.td | 5 +++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..46e8c6ebc2381 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -8,6 +8,15 @@ RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first. RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --stack-first --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check that the default is no-stack-first +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +60,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..aad8095881bba 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, OPT_no_stack_first, false); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..0ed69258d99aa 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear 
memory rather than after data">;
+defm stack_first: B<"stack-first",
+  "Place stack at start of linear memory",
+  "Place the stack after static data region (default)">;
 
 def table_base: JJ<"table-base=">,
   HelpText<"Table offset at which to place address taken functions (Defaults to 1)">;

From 2bc22ea02edda5926f3e53f141def9bf212ac1db Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Tue, 4 Nov 2025 11:00:35 -0800
Subject: [PATCH 213/313] [NFC][SLU] Update SimpleLoopUnswitch/guards.ll
 (#166285)

When running UTC on SLU/guards.ll (without LLVM changes), there are a
number of changes in the UTC-generated checks. Submitting those first to
simplify the diff of PR #164271, as most of the changes in the latter
were actually these.

---
 .../Transforms/SimpleLoopUnswitch/guards.ll   | 301 +++++++++++-------
 1 file changed, 190 insertions(+), 111 deletions(-)

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 533b1f691f5ad..706b49df14749 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s
 ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s
 ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s
@@ -6,21 +6,30 @@
 declare void @llvm.experimental.guard(i1, ...)
 
 define void @test_simple_case(i1 %cond, i32 %N) {
-; CHECK-LABEL: @test_simple_case(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
-; CHECK:       entry.split.us:
-; CHECK-NEXT:    br label [[LOOP_US:%.*]]
-; CHECK:       loop.us:
-; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
-; CHECK-NEXT:    br label [[GUARDED_US]]
-; CHECK:       guarded.us:
+; CHECK-LABEL: define void @test_simple_case(
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]]
+; CHECK:       [[ENTRY_SPLIT_US]]:
+; CHECK-NEXT:    br label %[[LOOP_US:.*]]
+; CHECK:       [[LOOP_US]]:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ]
+; CHECK-NEXT:    br label %[[GUARDED_US]]
+; CHECK:       [[GUARDED_US]]:
 ; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
-; CHECK:       deopt:
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]]
+; CHECK:       [[EXIT_SPLIT_US]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    br label %[[DEOPT:.*]]
+; CHECK:       [[DEOPT]]:
 ; CHECK-NEXT:    call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: [[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: 
[[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop From 7398591148f4351b38404304d8e2acb80651aaf3 Mon Sep 17 00:00:00 2001 From: Grigory Pastukhov <99913765+grigorypas@users.noreply.github.com> Date: Tue, 4 Nov 2025 11:01:50 -0800 Subject: [PATCH 214/313] [CodeGen] Add skipFunction() check to MachineFunctionSplitter (#166260) MachineFunctionSplitter was missing a skipFunction() check, causing it to incorrectly split functions that should be skipped (e.g., functions with optnone attribute). This patch adds an early skipFunction() check in runOnMachineFunction() to ensure these functions are never split, regardless of profile data availability or other splitting conditions. --- llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 3 ++ .../machine-function-splitter-optnone.ll | 50 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. 
if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} From fe5c3cbbd02f201f96e2f55aebe5e0a1032c1594 Mon Sep 17 00:00:00 2001 From: yasmincs <ysarita@nvidia.com> Date: Tue, 4 Nov 2025 11:02:49 -0800 Subject: [PATCH 215/313] Added Conditions of SM90 and ISA7.8 for Using cvt.ftz.f32.bf16 Instruction (#165774) Updated the conditions for generating the cvt.ftz.f32.bf16 instruction to include sm90 and isa7.8, so that ftz is only generated when it is supported. 
--------- Co-authored-by: Justin Fargnoli <jfargnoli@nvidia.com> --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 343 +++++++++++++++++-- 2 files changed, 311 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b26022184708c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: 
cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 
%r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; 
SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: 
ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( 
; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; From 92a1eb37122fa24e3045fbabdea2bf87127cace5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 11:07:57 -0800 Subject: [PATCH 216/313] AArch64: Regenerate cost model tests Broken by 831e79adff4506a0b22a770dcaa46bf5a37257cb, though presubmit was somehow green. --- llvm/test/Analysis/CostModel/AArch64/sincos.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 72c8f2bbbf8cf..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -38,14 +38,14 @@ define void @sincos() { ; ; SINCOS_STRET-LABEL: 'sincos' ; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for 
instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) From dc94f2cbadfd192fe3d43bd00fd5a1d0ead5ab8d Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <robert.imschweiler@amd.com> Date: Tue, 4 Nov 2025 20:15:47 +0100 Subject: [PATCH 217/313] [Offload] Add device UID (#164391) Introduced in OpenMP 6.0, the device UID shall be a unique identifier of a device on a given system. (Not necessarily a UUID.) Since it is not guaranteed that the (U)UIDs defined by the device vendor libraries, such as HSA, do not overlap with those of other vendors, the device UIDs in offload are always combined with the offload plugin name. In case the vendor library does not specify any device UID for a given device, we fall back to the offload-internal device ID. The device UID can be retrieved using the `llvm-offload-device-info` tool. 
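As a usage illustration only (a minimal sketch, not part of this patch):
querying the new OL_DEVICE_INFO_UID follows the same two-step
size-then-value pattern as the other string queries, which the unit tests
below exercise. The header name and the OL_SUCCESS comparison here are
assumptions, not something this patch prescribes.

  // Minimal sketch: read a device's UID through the liboffload C API.
  #include <OffloadAPI.h>

  #include <cstdio>
  #include <vector>

  bool printDeviceUid(ol_device_handle_t Device) {
    size_t Size = 0;
    if (olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size) != OL_SUCCESS)
      return false;
    std::vector<char> Uid(Size); // Size includes the terminating NUL.
    if (olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, Uid.data()) !=
        OL_SUCCESS)
      return false;
    // E.g. "amdgpu-GPU-<hex>" or "cuda-<hex>"; the host device reports "HOST".
    std::printf("UID: %s\n", Uid.data());
    return true;
  }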
--- offload/liboffload/API/Device.td | 1 + offload/liboffload/src/OffloadImpl.cpp | 7 +++++-- .../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 +++++++++++++ .../common/include/PluginInterface.h | 19 ++++++++++++++++- .../common/src/PluginInterface.cpp | 21 ++++++++++++++++--- .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 4 ++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 7 +++++++ .../deviceinfo/llvm-offload-device-info.cpp | 1 + .../OffloadAPI/device/olGetDeviceInfo.cpp | 20 ++++++++++++++++++ .../OffloadAPI/device/olGetDeviceInfoSize.cpp | 1 + 12 files changed, 91 insertions(+), 6 deletions(-) diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 5b54c79d83f9d..e9c154818c4a1 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -29,6 +29,7 @@ def ol_device_info_t : Enum { TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, + TaggedEtor<"UID", "char[]", "Device UID">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6d22faeb0e57e..84bc414396811 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -147,8 +147,8 @@ llvm::Error ol_platform_impl_t::init() { if (llvm::Error Err = Plugin->initDevice(Id)) return Err; - auto Device = &Plugin->getDevice(Id); - auto Info = Device->obtainInfoImpl(); + GenericDeviceTy *Device = &Plugin->getDevice(Id); + llvm::Expected<InfoTreeNode> Info = Device->obtainInfo(); if (llvm::Error Err = Info.takeError()) return Err; Devices.emplace_back(std::make_unique<ol_device_impl_t>(Id, Device, *this, @@ -467,6 +467,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, switch (PropName) { case OL_DEVICE_INFO_NAME: case OL_DEVICE_INFO_PRODUCT_NAME: + case OL_DEVICE_INFO_UID: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -544,6 +545,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_PRODUCT_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_UID: + return Info.writeString(GenericPluginTy::getHostDeviceUid()); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..ddfa65c76cf2d 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -72,6 +72,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + HSA_AMD_AGENT_INFO_UUID = 0xA011, HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, } hsa_amd_agent_info_t; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 0b03ef534d273..928c6cd7569e3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2083,6 +2083,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; ComputeUnitKind = GPUName; + // From the ROCm HSA documentation: + // Query the UUID of the agent. The value is an Ascii string with a maximum + // of 21 chars including NUL. The string value consists of two parts: header + // and body. The header identifies the device type (GPU, CPU, DSP) while the + // body encodes the UUID as a 16 digit hex string. + // + // Agents that do not support UUID will return the string "GPU-XX" or + // "CPU-XX" or "DSP-XX" depending on their device type. + char UUID[24] = {0}; + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_UUID, UUID)) + return Err; + if (!StringRef(UUID).ends_with("-XX")) + setDeviceUidFromVendorUid(UUID); + // Get the wavefront size. uint32_t WavefrontSize = 0; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index f9bff9abd903c..f9dcdea7213fd 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -791,6 +791,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this id is not unique between different plugins; they may overlap. int32_t getDeviceId() const { return DeviceId; } + /// Get the unique identifier of the device. + const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -989,9 +992,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; + /// Obtain information about the device. + Expected<InfoTreeNode> obtainInfo(); + virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; + /// Print information about the device. Error printInfo(); - virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; /// Return true if the device has work that is either queued or currently /// running @@ -1204,6 +1210,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// global device id and is not the device id visible to the OpenMP user. const int32_t DeviceId; + /// The unique identifier of the device. + /// Per default, the unique identifier of the device is set to the device id, + /// combined with the plugin name, since the offload device id may overlap + /// between different plugins. + std::string DeviceUid; + /// Construct the device UID from the vendor (U)UID. + void setDeviceUidFromVendorUid(StringRef VendorUid); + /// The default grid values used for this device. llvm::omp::GV GridValues; @@ -1290,6 +1304,9 @@ struct GenericPluginTy { return UserDeviceIds.at(DeviceId); } + /// Get the UID for the host device. + static constexpr const char *getHostDeviceUid() { return "HOST"; } + /// Get the ELF code to recognize the binary image of this plugin. 
virtual uint16_t getMagicElfBits() const = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 36d643b65922d..d7e5a21600abf 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -715,6 +715,10 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), PinnedAllocs(*this), RPCServer(nullptr) { + // Conservative fall-back to the plugin's device uid for the case that no real + // vendor (u)uid will become available later. + setDeviceUidFromVendorUid(std::to_string(static_cast<uint64_t>(DeviceId))); + #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -1524,15 +1528,22 @@ Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, return Err; } +Expected<InfoTreeNode> GenericDeviceTy::obtainInfo() { + auto InfoOrErr = obtainInfoImpl(); + if (InfoOrErr) + InfoOrErr->add("UID", getDeviceUid(), "", DeviceInfo::UID); + return InfoOrErr; +} + Error GenericDeviceTy::printInfo() { - auto Info = obtainInfoImpl(); + auto InfoOrErr = obtainInfo(); // Get the vendor-specific info entries describing the device properties. - if (auto Err = Info.takeError()) + if (auto Err = InfoOrErr.takeError()) return Err; // Print all info entries. - Info->print(); + InfoOrErr->print(); return Plugin::success(); } @@ -1603,6 +1614,10 @@ Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { return isAccessiblePtrImpl(Ptr, Size); } +void GenericDeviceTy::setDeviceUidFromVendorUid(StringRef VendorUid) { + DeviceUid = std::string(Plugin.getName()) + "-" + std::string(VendorUid); +} + Error GenericPluginTy::init() { if (Initialized) return Plugin::success(); diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index f5b2d074a47e7..e7a1ca38b3c13 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -35,6 +35,7 @@ DLWRAP(cuFuncSetAttribute, 3) // Device info DLWRAP(cuDeviceGetName, 3) +DLWRAP(cuDeviceGetUuid, 2) DLWRAP(cuDeviceTotalMem, 2) DLWRAP(cuDriverGetVersion, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index dec4e33508c62..a470d6df1079d 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -33,6 +33,9 @@ typedef struct CUfunc_st *CUfunction; typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef struct CUuuid_st { + char bytes[16]; +} CUuuid; #define CU_DEVICE_INVALID ((CUdevice)(-2)) @@ -301,6 +304,7 @@ CUresult cuFuncSetAttribute(CUfunction, CUfunction_attribute, int); // Device info CUresult cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceGetUuid(CUuuid *, CUdevice); CUresult cuDeviceTotalMem(size_t *, CUdevice); CUresult cuDriverGetVersion(int *); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..a9adcc397fb7b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -25,6 +25,7 @@ #include "PluginInterface.h" #include "Utils/ELF.h" +#include 
"llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -293,6 +294,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s")) return Err; + CUuuid UUID = {0}; + Res = cuDeviceGetUuid(&UUID, Device); + if (auto Err = Plugin::check(Res, "error in cuDeviceGetUuid: %s")) + return Err; + setDeviceUidFromVendorUid(toHex(UUID.bytes, true)); + // Query the current flags of the primary context and set its flags if // it is inactive. unsigned int FormerPrimaryCtxFlags = 0; diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 9b58d67f017ca..42ffb97d6d77c 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -176,6 +176,7 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME, "Product Name")); + OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_UID, "UID")); OFFLOAD_ERR( printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); OFFLOAD_ERR(printDeviceValue<const char *>( diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 8cb0b8065c33e..30eafee026316 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -98,6 +98,16 @@ TEST_P(olGetDeviceInfoTest, SuccessProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, HostProductName) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); @@ -109,6 +119,16 @@ TEST_P(olGetDeviceInfoTest, HostProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, HostUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Host, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index c4a3c2d5e3c75..79a18c1d133dc 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -32,6 +32,7 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, OL_DEVICE_INFO_PLATFORM); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(UID, OL_DEVICE_INFO_UID); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); 
OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION);
 OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t,

From e5f191e171720b413f83bff13b61d500369f560d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 11:21:32 -0800
Subject: [PATCH 218/313] DeclareRuntimeLibcalls: Add registered target checks
 to test run lines

---
 .../Util/DeclareRuntimeLibcalls/sincos_stret.ll    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
index 0d0e3da25eea7..f0f09e97d9dba 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
+; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %}
 
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %}
 
 ; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]

From 5be12e1c95e897a9da713d49255868eea7ad60fa Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 4 Nov 2025 20:23:33 +0100
Subject: [PATCH 219/313] [LLDB] Run API tests with PDB too (#149305)

From https://github.com/llvm/llvm-project/pull/148554#issuecomment-3083261858 -
this adds an option for API tests to be run with both PDB readers on
Windows. As there are a lot of failures with PDB, this is an opt-in per
test.

To get PDB, `-g -gcodeview` has to be used on Clang. `-gcodeview` alone
isn't enough, because it won't cause clang to pass `-debug` to the
linker.
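For illustration, opting a test in is a single class attribute. This is a
sketch mirroring the TestPdb.py added below; the class name is made up:

  # The "pdb" debug-info variant is only generated on Windows hosts.
  from lldbsuite.test.lldbtest import TestBase

  class TestSomethingWithPdb(TestBase):
      TEST_WITH_PDB_DEBUG_INFO = True  # also replicate this test for PDB

      def test(self):
          self.build()  # the pdb variant builds with -g -gcodeview
          if self.getDebugInfo() == "pdb":
              pass  # PDB-specific expectations go here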
#149498 tracks the (currently) failing tests. --- .../Python/lldbsuite/test/builders/builder.py | 1 + .../packages/Python/lldbsuite/test/lldbtest.py | 12 ++++++++++++ .../Python/lldbsuite/test/make/Makefile.rules | 4 ++++ .../Python/lldbsuite/test/test_categories.py | 11 ++++++++++- lldb/test/API/test_utils/pdb/Makefile | 3 +++ lldb/test/API/test_utils/pdb/TestPdb.py | 18 ++++++++++++++++++ lldb/test/API/test_utils/pdb/main.cpp | 1 + 7 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/test_utils/pdb/Makefile create mode 100644 lldb/test/API/test_utils/pdb/TestPdb.py create mode 100644 lldb/test/API/test_utils/pdb/main.cpp diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 96c7b3987d8a1..024c9f1c7e435 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info): "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"}, "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, + "pdb": {"MAKE_PDB": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b92de941c4124..8c1eea97620e2 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1791,6 +1791,11 @@ def no_reason(_): if can_replicate ] + # PDB is off by default, because it has a lot of failures right now. + # See llvm.org/pr149498 + if original_testcase.TEST_WITH_PDB_DEBUG_INFO: + dbginfo_categories.append("pdb") + xfail_for_debug_info_cat_fn = getattr( attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason ) @@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory): # test multiple times with various debug info types. NO_DEBUG_INFO_TESTCASE = False + TEST_WITH_PDB_DEBUG_INFO = False + """ + Subclasses can set this to True to test with PDB in addition to the other debug info + types. This is off by default because many tests will fail due to missing functionality in PDB. + See llvm.org/pr149498. + """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 28cae54776ac8..b3822db162a0b 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf.
ifeq "$(OS)" "Windows_NT" diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + + def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } From 0ae0ac0f004d4028051ba01335b02ebadd2b8b4d Mon Sep 17 00:00:00 2001 From: Ross Burton <ross.burton@arm.com> Date: Tue, 4 Nov 2025 19:26:02 +0000 Subject: [PATCH 220/313] [cmake] Pass PYTHON_EXECUTABLE to native builds (#163574) Ensure that the nested native build uses the same python interpreter as the main build, in case the python that CMake detects first is not the python that the user has specified explicitly. For example, if the person building LLVM wants to use a different python interpreter to build (eg, testing the build with `python3.14` when `python3` is a link to `python3.8`, or the default python doesn't have development headers available) then they could add `-DPYTHON_EXECUTABLE=python3.14` when invoking CMake. This should be forwarded to the native CMake build to ensure that the same python is used. Original fix by Anuj Mittal <anuj.mittal@intel.com>. 
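For illustration, a configure line such as the following (the target, toolchain path, and interpreter version are hypothetical):

    cmake -G Ninja ../llvm \
      -DLLVM_TARGETS_TO_BUILD=AArch64 \
      -DCMAKE_TOOLCHAIN_FILE=/path/to/aarch64-toolchain.cmake \
      -DPYTHON_EXECUTABLE=/usr/bin/python3.14

now hands the same interpreter to the nested native (tablegen) build, instead of letting that build re-detect whichever python CMake finds first.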
--- llvm/cmake/modules/CrossCompile.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} From 4776451693f4a6bd18e50106edb4b3cfa766484f Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Tue, 4 Nov 2025 11:27:38 -0800 Subject: [PATCH 221/313] [X86][NewPM] Port lower-amx-intrinsics to NewPM Reviewers: paperchalice, phoebewang, arsenm Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/165113 --- llvm/lib/Target/X86/X86.h | 13 ++++- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48 +++++++++++++++---- llvm/lib/Target/X86/X86PassRegistry.def | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +- .../AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3 +- .../CodeGen/X86/AMX/amx-low-intrinsics.ll | 3 +- 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..bdb43cfb4adb4 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. -FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + 
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55d3059a..81c98febc4ba8 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,13 +15,13 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abcd351bf..bf4dab0371b88 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -422,7 +422,7 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
- addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( From e03fac127c8db8a988c8346295f9862dd59c42fc Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Tue, 4 Nov 2025 11:28:27 -0800 Subject: [PATCH 222/313] [X86][NewPM] Port X86PartialReduction to NewPM There are no tests that specifically stop/start at x86-partial-reduction, so no test cases have been updated for this patch. Reviewers: phoebewang, paperchalice, RKSimon, arsenm Reviewed By: arsenm, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/166048 --- llvm/lib/Target/X86/X86.h | 13 +++- llvm/lib/Target/X86/X86PartialReduction.cpp | 72 +++++++++++++-------- llvm/lib/Target/X86/X86PassRegistry.def | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 4 +- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index bdb43cfb4adb4..fa23656e23fc3 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. 
FunctionPass *createX86WinEHUnwindV2Pass(); @@ -231,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0f464a4..898c83cf9b468 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; +public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. 
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 81c98febc4ba8..db255940f8829 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -17,12 +17,12 @@ #endif FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index bf4dab0371b88..5f0bcab251e61 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -429,7 +429,7 @@ void X86PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline From 7272a6c8882d99fe1fc73d2c69ddf976948f0e50 Mon Sep 17 00:00:00 2001 From: yonghong-song <yhs@fb.com> Date: Tue, 4 Nov 2025 11:32:12 -0800 Subject: [PATCH 223/313] [BPF] Avoid relocation for jumptable entries (#166301) Currently, jump table entries contain labels only. For example: BPF.JT.0.0: .quad LBB0_1 .quad LBB0_2 .size BPF.JT.0.0, 16 Since the jump table entry contains a label, a relocation is necessary so the linker can resolve the label value.
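(Each 8-byte entry holds an absolute code address that is only known at link time, hence the per-entry R_BPF_64_ABS64 relocation shown below.)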
The relocation looks like below: Relocation section '.rel.jumptables' at offset 0x160 contains 2 entries: Offset Info Type Symbol's Value Symbol's Name 0000000000000000 0000000200000002 R_BPF_64_ABS64 0000000000000000 .text 0000000000000008 0000000200000002 R_BPF_64_ABS64 0000000000000000 .text You can see that the symbol value is 0 which makes .rel.jumptables not very useful. Instead of having the label itself in the jump table entry, use the difference of label and the section begin symbol. This can avoid the relocation and the eventual jumptable entries in object file remains the same as before. Hex dump of section '.jumptables': 0x00000000 68000000 00000000 78000000 00000000 h.......x....... --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 10 +++- llvm/test/CodeGen/BPF/jump_table_blockaddr.ll | 4 +- .../test/CodeGen/BPF/jump_table_global_var.ll | 4 +- .../CodeGen/BPF/jump_table_switch_stmt.ll | 60 +++++++++---------- 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index b2a82040ee823..378a72ab27dd5 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -219,6 +219,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -231,8 +235,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- 
a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 240 From a99e13271e25269f0feb7b256676594d315fa381 Mon Sep 17 00:00:00 2001 From: Baranov Victor <bar.victor.2002@gmail.com> Date: Tue, 4 Nov 2025 22:32:43 +0300 Subject: [PATCH 224/313] [Docs][clang-tools-extra] Convert maintainers list to .rst format (#165171) [Clang maintainers list](https://github.com/llvm/llvm-project/blob/main/clang/Maintainers.rst) is already in `.rst` format, which gives nice visuals. I think we should convert clang-tools-extra maintainers too to `.rst`. --- clang-tools-extra/{Maintainers.txt => Maintainers.rst} | 8 ++++++-- clang-tools-extra/docs/Maintainers.rst | 1 + clang-tools-extra/docs/index.rst | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) rename clang-tools-extra/{Maintainers.txt => Maintainers.rst} (91%) create mode 100644 clang-tools-extra/docs/Maintainers.rst diff --git a/clang-tools-extra/Maintainers.txt b/clang-tools-extra/Maintainers.rst similarity index 91% rename from clang-tools-extra/Maintainers.txt rename to clang-tools-extra/Maintainers.rst index 43dfd48ad1f57..2603ebadf529c 100644 --- a/clang-tools-extra/Maintainers.txt +++ b/clang-tools-extra/Maintainers.rst @@ -2,9 +2,13 @@ Clang Tools Extra Maintainers ============================= -This file is a list of the maintainers -(https://llvm.org/docs/DeveloperPolicy.html#maintainers) for clang-tools-extra. +This file is a list of the +`maintainers <https://llvm.org/docs/DeveloperPolicy.html#maintainers>`_ +for `Extra Clang Tools <https://clang.llvm.org/extra/index.html>`_ project. +.. 
contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd <https://clangd.llvm.org/> clang-doc + Maintainers Doxygen Documentation From 8f683c3e4b6fe939a7d0f1167934aa823a889267 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:34:54 -0500 Subject: [PATCH 225/313] [AMDGPU] NFC, delete promote-alloca testcase (#166297) previous merge did not delete. --- .../AMDGPU/promote-alloca-array-to-vector.ll | 325 ------------------ 1 file changed, 325 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll deleted file mode 100644 index 05a0e39d4a715..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll +++ /dev/null @@ -1,325 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s - -define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { -; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( -; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 -; OPT-NEXT: 
[[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 -; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 -; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 -; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 -; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 -; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 -; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 -; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 -; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 -; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 -; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 -; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 -; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 -; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 -; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 -; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 -; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 -; OPT-NEXT: 
[[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 -; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 -; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 -; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 -; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 -; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 -; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 -; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 -; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 -; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 -; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 -; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 -; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 -; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 -; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 -; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 -; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 -; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 -; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 -; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 -; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 -; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 -; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 -; OPT-NEXT: 
[[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 -; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 -; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 -; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 -; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 -; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 -; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 -; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 -; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 -; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 -; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 -; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 -; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 -; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 -; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 -; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 -; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 -; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 -; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 -; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 -; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 -; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 -; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: 
[[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 -; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 -; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 -; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 -; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 -; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 -; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 -; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 -; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 -; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 -; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 -; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 -; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 -; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 -; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 -; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 -; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 -; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 -; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 -; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 -; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 -; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 -; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 -; 
OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 -; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 -; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 -; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 -; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 -; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 -; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 -; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 -; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 -; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 -; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 -; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 -; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 -; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 -; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 -; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 -; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 -; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 -; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 -; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 -; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 -; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 -; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: 
[[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 -; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 -; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 -; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 -; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 -; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 -; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 -; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 -; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 -; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 -; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 -; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 -; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 -; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 -; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 -; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 -; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 -; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 -; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 -; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 -; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 -; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 -; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 -; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 -; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 -; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90 -; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 -; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 -; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 -; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 -; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 -; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 -; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 -; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 -; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 -; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 -; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 -; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] -; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 -; OPT-NEXT: ret void -; -entry: - %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) - %gep0 = 
getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 - %sum = add <16 x i8> %load, %add - store <16 x i8> %sum, ptr addrspace(3) %out, align 16 - ret void -} - -attributes #0 = {"amdgpu-waves-per-eu"="2,2"} From 2a65fab965d60939ebf5d2ba9dab03ac3a69d846 Mon Sep 17 00:00:00 2001 From: Kai Nacke <kai.peter.nacke@ibm.com> Date: Tue, 4 Nov 2025 15:07:20 -0500 Subject: [PATCH 226/313] [z/OS] Improve compiler options on z/OS (#166415) `_XPLATFORM_SOURCE` needs to be defined to improve source code compatibility (e.g. for `O_CLOEXEC`). The define `_UNIX03_THREADS` can be removed, because it is automatically set by `_XOPEN_SOURCE=600`. See the documentation of feature test macros: https://www.ibm.com/docs/en/zos/3.1.0?topic=files-feature-test-macros Tested on z/OS 3.1 with the Open XL C/C++ 2.2 compiler. --- llvm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..f192cd05b5a34 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. From 718818a5cb4ce10aca8852e4d6675bb28ff4eacd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= <tuomas.karna@intel.com> Date: Tue, 4 Nov 2025 22:08:42 +0200 Subject: [PATCH 227/313] [MLIR][Linalg][Transform] Expose more args in VectorizeChildren[...] op's Python bindings (#166134) Expose missing boolean arguments in `VectorizeChildrenAndApplyPatternsOp` Python bindings. 
--- mlir/python/mlir/dialects/transform/structured.py | 4 ++++ mlir/test/python/dialects/transform_structured_ext.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 14c7380e432f0..d9ab504f0de54 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -713,6 +713,8 @@ def __init__( disable_transfer_permutation_map_lowering_patterns: bool = False, vectorize_nd_extract: bool = False, vectorize_padding: bool = False, + flatten_1d_depthwise_conv: bool = False, + fold_type_extensions_into_contract: bool = False, loc=None, ip=None, ): @@ -722,8 +724,10 @@ def __init__( target, disable_multi_reduction_to_contract_patterns=disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns=disable_transfer_permutation_map_lowering_patterns, + flatten_1d_depthwise_conv=flatten_1d_depthwise_conv, vectorize_nd_extract=vectorize_nd_extract, vectorize_padding=vectorize_padding, + fold_type_extensions_into_contract=fold_type_extensions_into_contract, loc=loc, ip=ip, ) diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index d6b70dc9d1978..e58b7646316fc 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -627,12 +627,16 @@ def testVectorizeChildrenAndApplyPatternsAllAttrs(target): disable_transfer_permutation_map_lowering_patterns=True, vectorize_nd_extract=True, vectorize_padding=True, + flatten_1d_depthwise_conv=True, + fold_type_extensions_into_contract=True, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsAllAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-SAME: disable_multi_reduction_to_contract_patterns # CHECK-SAME: disable_transfer_permutation_map_lowering_patterns + # CHECK-SAME: flatten_1d_depthwise_conv + # CHECK-SAME: fold_type_extensions_into_contract # CHECK-SAME: vectorize_nd_extract # CHECK-SAME: vectorize_padding @@ -646,12 +650,16 @@ def testVectorizeChildrenAndApplyPatternsNoAttrs(target): disable_transfer_permutation_map_lowering_patterns=False, vectorize_nd_extract=False, vectorize_padding=False, + flatten_1d_depthwise_conv=False, + fold_type_extensions_into_contract=False, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsNoAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-NOT: disable_multi_reduction_to_contract_patterns # CHECK-NOT: disable_transfer_permutation_map_lowering_patterns + # CHECK-NOT: flatten_1d_depthwise_conv + # CHECK-NOT: fold_type_extensions_into_contract # CHECK-NOT: vectorize_nd_extract # CHECK-NOT: vectorize_padding From 6f91f588d98aab7a0deb9db76b7c44ae237ff4e0 Mon Sep 17 00:00:00 2001 From: Hans Wennborg <hans@hanshq.net> Date: Tue, 4 Nov 2025 21:19:32 +0100 Subject: [PATCH 228/313] build_llvm_release.bat: Put the 32-bit sanitizers back (#166437) Follow-up to bcb3d2f5122276ed9969fe2b2ef4428652800377. Even though 32-bit win/asan is not well supported, we shouldn't drop it without some discussion at least. Also, we probably shouldn't drop the other sanitizers that are gated by COMPILER_RT_BUILD_SANITIZERS. The tests no longer pass after switching to the runtimes build however (I believe that build mode runs more of the tests?), so disable them. 
--- llvm/utils/release/build_llvm_release.bat | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 0764c7af86c0a..3aeedaa3288a2 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -220,7 +220,6 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin" set cmake_flags=^ %common_cmake_flags% ^ -DLLVM_ENABLE_RPMALLOC=OFF ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib @@ -230,7 +229,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -239,7 +238,6 @@ REM with forward slash. set all_cmake_flags=^ %cmake_flags% ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ %common_lldb_flags% ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ @@ -256,7 +254,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. 
From 240fe7e8445eee0ee6bca9c537ede3fdcb820f24 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 21:21:56 +0100 Subject: [PATCH 229/313] [CIR][NFC] EHScope & Cleanups Iterators and operators overloading (#165317) Upstream EHScope & Cleanup iterators, helpers and operator overloading as a prerequisite for #165158 Issue https://github.com/llvm/llvm-project/issues/154992 --- clang/lib/CIR/CodeGen/CIRGenCleanup.cpp | 8 +-- clang/lib/CIR/CodeGen/CIRGenCleanup.h | 71 ++++++++++++++++++++++--- clang/lib/CIR/CodeGen/EHScopeStack.h | 8 +++ 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 851328a7db680..437db306f3369 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) - EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + EHCleanupScope *scope = new (buffer) EHCleanupScope( + size, branchFixups.size(), innermostNormalCleanup, innermostEHScope); if (isNormalCleanup) innermostNormalCleanup = stable_begin(); @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() { EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); assert(!cir::MissingFeatures::innermostEHScope()); - EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + EHCatchScope *scope = + new (buffer) EHCatchScope(numHandlers, innermostEHScope); + innermostEHScope = stable_begin(); return scope; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 61a09a59b05c0..a035d792ef6d1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -30,6 +30,8 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. class EHScope { + EHScopeStack::stable_iterator enclosingEHScope; + class CommonBitFields { friend class EHScope; unsigned kind : 3; @@ -79,7 +81,10 @@ class EHScope { public: enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind) { commonBits.kind = kind; } + EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) + : enclosingEHScope(enclosingEHScope) { + commonBits.kind = kind; + } Kind getKind() const { return static_cast<Kind>(commonBits.kind); } @@ -90,6 +95,10 @@ class EHScope { assert(!cir::MissingFeatures::ehstackBranches()); return false; } + + EHScopeStack::stable_iterator getEnclosingEHScope() const { + return enclosingEHScope; + } }; /// A scope which attempts to handle some, possibly all, types of @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope { /// The catch handler for this type. 
mlir::Region *region; + + bool isCatchAll() const { return type.rtti == nullptr; } }; private: @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope { Handler *getHandlers() { return reinterpret_cast<Handler *>(this + 1); } + const Handler *getHandlers() const { + return reinterpret_cast<const Handler *>(this + 1); + } + public: static size_t getSizeForNumHandlers(unsigned n) { return sizeof(EHCatchScope) + n * sizeof(Handler); } - EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + EHCatchScope(unsigned numHandlers, + EHScopeStack::stable_iterator enclosingEHScope) + : EHScope(Catch, enclosingEHScope) { catchBits.numHandlers = numHandlers; assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); } @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope { getHandlers()[i].region = region; } + const Handler &getHandler(unsigned i) const { + assert(i < getNumHandlers()); + return getHandlers()[i]; + } + // Clear all handler blocks. // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a // 'takeHandler' or some such function which removes ownership from the @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope { // The blocks are owned by TryOp, nothing to delete. } + using iterator = const Handler *; + iterator begin() const { return getHandlers(); } + iterator end() const { return getHandlers() + getNumHandlers(); } + static bool classof(const EHScope *scope) { return scope->getKind() == Catch; } @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope } EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, - EHScopeStack::stable_iterator enclosingNormal) - : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), - fixupDepth(fixupDepth) { + EHScopeStack::stable_iterator enclosingNormal, + EHScopeStack::stable_iterator enclosingEH) + : EHScope(EHScope::Cleanup, enclosingEH), + enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. 
cleanupBits.isNormalCleanup = true; @@ -235,13 +262,45 @@ class EHScopeStack::iterator { EHScope *get() const { return reinterpret_cast<EHScope *>(ptr); } + EHScope *operator->() const { return get(); } EHScope &operator*() const { return *get(); } + + iterator &operator++() { + size_t size; + switch (get()->getKind()) { + case EHScope::Catch: + size = EHCatchScope::getSizeForNumHandlers( + static_cast<const EHCatchScope *>(get())->getNumHandlers()); + break; + + case EHScope::Filter: + llvm_unreachable("EHScopeStack::iterator Filter"); + break; + + case EHScope::Cleanup: + llvm_unreachable("EHScopeStack::iterator Cleanup"); + break; + + case EHScope::Terminate: + llvm_unreachable("EHScopeStack::iterator Terminate"); + break; + } + ptr += llvm::alignTo(size, ScopeStackAlignment); + return *this; + } + + bool operator==(iterator other) const { return ptr == other.ptr; } + bool operator!=(iterator other) const { return ptr != other.ptr; } }; inline EHScopeStack::iterator EHScopeStack::begin() const { return iterator(startOfData); } +inline EHScopeStack::iterator EHScopeStack::end() const { + return iterator(endOfBuffer); +} + inline EHScopeStack::iterator EHScopeStack::find(stable_iterator savePoint) const { assert(savePoint.isValid() && "finding invalid savepoint"); @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() { assert(!empty() && "popping exception stack when not empty"); EHCatchScope &scope = llvm::cast<EHCatchScope>(*begin()); - assert(!cir::MissingFeatures::innermostEHScope()); + innermostEHScope = scope.getEnclosingEHScope(); deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 4198c23c9cbed..9005b0106b2a4 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -155,6 +155,9 @@ class EHScopeStack { /// The innermost normal cleanup on the stack. stable_iterator innermostNormalCleanup = stable_end(); + /// The innermost EH scope on the stack. + stable_iterator innermostEHScope = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; @@ -226,6 +229,8 @@ class EHScopeStack { } stable_iterator getInnermostActiveNormalCleanup() const; + stable_iterator getInnermostEHScope() const { return innermostEHScope; } + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -233,6 +238,9 @@ class EHScopeStack { /// Returns an iterator pointing to the innermost EH scope. iterator begin() const; + /// Returns an iterator pointing to the outermost EH scope. + iterator end() const; + /// Create a stable reference to the top of the EH stack. The /// returned reference is valid until that scope is popped off the /// stack. From 4f428d30e4d8287169fbc2acfcf37ca7b37ed539 Mon Sep 17 00:00:00 2001 From: Nick Sarnie <nick.sarnie@intel.com> Date: Wed, 5 Nov 2025 05:34:42 +0900 Subject: [PATCH 230/313] [clang][lit] Add SPIR-V to some OpenMP offload tests (#165775) Just to get a little more test coverage. 
Signed-off-by: Nick Sarnie <nick.sarnie@intel.com> --- clang/test/OpenMP/metadirective_ast_print.c | 36 ++++++++++++---- .../metadirective_device_arch_codegen.cpp | 28 ++++++++----- clang/test/OpenMP/thread_limit_amdgpu.c | 34 --------------- clang/test/OpenMP/thread_limit_gpu.c | 41 +++++++++++++++++++ 4 files changed, 86 insertions(+), 53 deletions(-) delete mode 100644 clang/test/OpenMP/thread_limit_amdgpu.c create mode 100644 clang/test/OpenMP/thread_limit_gpu.c diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c index 638dbae1bc774..75ef5fa26827c 100644 --- a/clang/test/OpenMP/metadirective_ast_print.c +++ b/clang/test/OpenMP/metadirective_ast_print.c @@ -2,17 +2,25 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU // expected-no-diagnostics #ifndef HEADER @@ -77,6 +85,12 @@ void foo1(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + otherwise(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) otherwise(parallel for) for (int i = 0; i < 16; i++) @@ -134,8 +148,8 @@ void foo1(void) { // OMP52-NEXT: for (int i = 0; i < 16; i++) { // 
OMP52-NEXT: #pragma omp simd // OMP52-NEXT: for (int j = 0; j < 16; j++) -// OMP52-AMDGCN: #pragma omp teams distribute parallel for -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// OMP52-GPU: #pragma omp teams distribute parallel for +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++) // OMP52: for (int i = 0; i < 16; i++) // OMP52: for (int i = 0; i < 16; i++) @@ -198,6 +212,12 @@ void foo2(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + default(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) default(parallel for) for (int i = 0; i < 16; i++) @@ -266,8 +286,8 @@ void foo2(void) { // DEFAULT-NEXT: for (int i = 0; i < 16; i++) { // DEFAULT-NEXT: #pragma omp simd // DEFAULT-NEXT: for (int j = 0; j < 16; j++) -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// DEFAULT-GPU: #pragma omp teams distribute parallel for +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++) // DEFAULT: for (int i = 0; i < 16; i++) // DEFAULT: for (int i = 0; i < 16; i++) diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp index eecae310d0a77..1d5584de67162 100644 --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp @@ -1,7 +1,7 @@ -// REQUIRES: amdgpu-registered-target - // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc -o - | FileCheck %s // expected-no-diagnostics @@ -16,6 +16,12 @@ Inspired from SOLLVE tests: #define N 1024 +#ifdef __AMDGPU__ +#define GPU "amdgcn" +#else +#define GPU "spirv64" +#endif + int metadirective1() { int v1[N], v2[N], v3[N]; @@ -26,7 +32,7 @@ int metadirective1() { #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device) { #pragma omp metadirective \ - when(device={arch("amdgcn")}: teams distribute parallel for) \ + when(device={arch(GPU)}: teams distribute parallel for) \ default(parallel for) for (int i = 0; i < N; i++) { @@ -38,28 +44,28 @@ int metadirective1() { return errors; } -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] // CHECK: entry: -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init // CHECK: user_code.entry: -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined -// CHECK-NOT: call void @__kmpc_parallel_51 +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51 // CHECK: ret void // CHECK: define internal void 
@[[METADIRECTIVE]]_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_distribute_static_init +// CHECK: call{{.*}} void @__kmpc_distribute_static_init // CHECK: omp.loop.exit: -// CHECK: call void @__kmpc_distribute_static_fini +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_for_static_init_4 +// CHECK: call{{.*}} void @__kmpc_for_static_init_4 // CHECK: omp.inner.for.body: // CHECK: store atomic {{.*}} monotonic // CHECK: omp.loop.exit: -// CHECK-NEXT: call void @__kmpc_for_static_fini +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c deleted file mode 100644 index f884eeb73c3ff..0000000000000 --- a/clang/test/OpenMP/thread_limit_amdgpu.c +++ /dev/null @@ -1,34 +0,0 @@ -// Test target codegen - host bc file has to be created first. -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -void foo(int N) { -#pragma omp target teams distribute parallel for simd - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd thread_limit(4) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) - for (int i = 0; i < N; ++i) - ; -} - -#endif - -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] { - -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c new file mode 100644 index 0000000000000..4bcc14d070c22 --- /dev/null +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -0,0 +1,41 @@ +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo(int N) { +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd thread_limit(4) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) + for (int i = 0; i < N; ++i) + ; +} + +#endif + +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { + +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } + +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} } From 2141edf506baab7e526f3a305bcdb6d6f2c772bc Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Tue, 4 Nov 2025 12:53:03 -0800 Subject: [PATCH 231/313] [lldb] Skip tests on older versions of clang --- .../TestLibcxxInternalsRecognizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index d8a729b322fe4..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 
@@ class LibCxxInternalsRecognizerTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @add_test_categories(["libc++"])
-    @skipIf(compiler="clang", compiler_version=["<", "19.0"])
+    @skipIf(compiler="clang", compiler_version=["<", "21.0"])
     def test_frame_recognizer(self):
         """Test that implementation details of libc++ are hidden"""
         self.build()

From 9703bda95b088bb6a455ef9faffdb41c537aff2f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:15:32 -0800
Subject: [PATCH 232/313] [mlir][xegpu] Add OptimizeBlockLoads pass. (#165483)

This pass rewrites certain xegpu `CreateNd` and `LoadNd` operations that
feed into `vector.transpose` into a more optimal form to improve
performance. Specifically, low-precision (bitwidth < 32) `LoadNd` ops that
feed into transpose ops are rewritten to i32 loads with a valid transpose
layout so that later passes can use the hardware's load-with-transpose
feature to accelerate such loads.

**Update:** The pass is renamed to `OptimizeBlockLoads` because we later
plan to add the array-length optimization to this pass as well. That
optimization will break down a larger load (like `32x32xf16`) into more
DPAS-favorable array-length loads (`32x16xf16` with array length = 2). Both
optimizations require rewriting `CreateNd` and `LoadNd`, so it makes sense
to have a common pass for both.
---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |  12 +
 .../Dialect/XeGPU/Transforms/Transforms.h     |   3 +-
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   9 +
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |   1 +
 .../Transforms/XeGPUOptimizeBlockLoads.cpp    | 490 ++++++++++++++++++
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  36 +-
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  26 +
 .../Dialect/XeGPU/optimize-transpose.mlir     | 280 ++++++++++
 8 files changed, 827 insertions(+), 30 deletions(-)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
 create mode 100644 mlir/test/Dialect/XeGPU/optimize-transpose.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index eb05628d4772b..e42799689e490 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -85,4 +85,16 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> {
                            "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"];
 }
 
+def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
+  let summary = "Optimize XeGPU block load operations";
+  let description = [{
+    This pass rewrites XeGPU loadNd operations into more optimal forms
+    to improve performance. This includes:
+    - Rewriting transpose B loads into more optimal forms to use HW block
+      transpose instructions for better performance.
+  }];
+  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+                           "vector::VectorDialect"];
+}
+
 #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index a480195eebd00..1776a209d0bf1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -61,7 +61,8 @@ struct UnrollOptions {
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
-
+/// Appends patterns for optimizing block load operations into `patterns`.
+void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU SIMT distribution into `patterns`.
 void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
 /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 620a2fe43d682..58092c3bb9ed2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -166,6 +166,15 @@ SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc,
 SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
                                               ArrayRef<OpFoldResult> lhs,
                                               ArrayRef<OpFoldResult> rhs);
+
+/// Helper Function to find a proper instruction multiple for the user-supplied
+/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes.
+/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
+/// array length).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+                      ArrayRef<T> candidateMultiples = {});
+
 } // namespace xegpu
 } // namespace mlir

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index e6f76067094ce..29b645feab2c6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUWgToSgDistribute.cpp
   XeGPUPropagateLayout.cpp
   XeGPUVectorLinearize.cpp
+  XeGPUOptimizeBlockLoads.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
new file mode 100644
index 0000000000000..4dc5ea4f7bb24
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -0,0 +1,490 @@
+//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+#include "mlir/Dialect/XeGPU/uArch/uArchBase.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Types.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <optional>
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-optimize-block-loads"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+
+/// Get the 2D lane data from a tensor desc type if it exists.
+static std::optional<SmallVector<int64_t>>
+getMaybeLaneData(xegpu::TensorDescType tdescType) {
+  auto layout = tdescType.getLayoutAttr();
+  if (!layout)
+    return std::nullopt;
+  auto laneData = layout.getEffectiveLaneDataAsInt();
+  if (laneData.size() != 2)
+    return std::nullopt;
+  return laneData;
+}
+
+/// Get the 2D lane layout from a tensor desc type if it exists.
+static std::optional<SmallVector<int64_t>>
+getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
+  auto layout = tdescType.getLayoutAttr();
+  if (!layout)
+    return std::nullopt;
+  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
+  if (laneLayout.size() != 2)
+    return std::nullopt;
+  return laneLayout;
+}
+
+/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
+/// lane[1] == 1), but inner lane data is not equal to [1, 1].
+/// Example:
+/// !xegpu.tensor_desc<16x16xf16,
+///   #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
+/// In this case, the lane layout is transposed (from the usual [1, SG_SIZE]
+/// form), indicating that this is a load that requires a transpose effect.
+/// However, lane data is [1, 2], meaning that each lane must grab 2 f16
+/// elements from the inner dimension. We convert this to an optimized form by
+/// converting the tensor_desc to i32 type such that lane data becomes [1, 1].
+/// This lets the later lowering easily use the load with transpose
+/// instruction.
+static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
+                                       ArrayRef<int64_t> laneData) {
+  if (laneLayout.size() != 2 || laneData.size() != 2)
+    return false;
+  if (laneLayout[0] == 1 || laneLayout[1] != 1)
+    return false;
+  if (laneData[0] != 1 || laneData[1] == 1)
+    return false;
+  return true;
+}
+
+/// A tensor desc type can be optimized if its element type is less than 32
+/// bits and its layout can be optimized.
+static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
+  // If the dtype is greater than or equal to 32 bits, the layout must already
+  // be valid.
+ int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + if (elementTyBitwidth >= 32) + return false; + auto maybeLaneLayout = getMaybeLaneLayout(tdescType); + auto maybeLaneData = getMaybeLaneData(tdescType); + if (!maybeLaneData || !maybeLaneLayout) + return false; + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); +} + +/// Check if a tensor desc type can be optimized for transpose, if so return the +/// new optimized tensor desc type with a valid transpose layout. +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, + const uArch *targetuArch) { + if (!canBeOptimizedForTranspose(tdescType)) + return tdescType; + auto laneData = getMaybeLaneData(tdescType) + .value(); // Lane data must exist if we reach here. + int64_t innerLaneData = laneData[1]; + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + // Required shape is total shape of the vector result that this tensor desc + // must eventually load after adjusting for the new bitwidth and array + // length. + SmallVector<int64_t> requiredShape(tdescType.getShape()); + requiredShape.back() = + requiredShape.back() * tdescType.getArrayLength() / innerLaneData; + int newBitWidth = elementTyBitwidth * innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); + // Supported shape is the max transpose shape that can be supported by + // hardware that is less than or equal to required shape. + auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>( + targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad)); + auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount( + newElemTy, /** has transform */ false, /** has transpose */ true); + // If no HW params found, return the original type. + if (!maybeHWParams) + return tdescType; + auto [widths, heights, counts] = maybeHWParams.value(); + // TODO: Currently we expect array length to be 1 for transpose case. + if (counts.size() != 1 || counts[0] != 1) + return tdescType; + int arrayLen = counts[0]; + int supportedHeight = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights); + int supportedWidth = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths); + // If no supported height or width found, return the original type. + if (supportedHeight == -1 || supportedWidth == -1) + return tdescType; + + SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth}; + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + // Array length can not be larger than 1 for transpose case. + return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen, + tdescType.getBoundaryCheck(), + tdescType.getMemorySpace(), newLayout); +} + +/// Helper to convert an OpFoldResult to Value. +static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, + OpFoldResult ofr) { + std::optional<int64_t> mayBeInt = getConstantIntValue(ofr); + if (mayBeInt) + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult(); + return llvm::cast<Value>(ofr); +} + +/// Helper to divide a Value by a constant integer. +static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, + Value val, int64_t constant) { + // If the constant is a power of 2, use right shift for division. 
+ if (llvm::isPowerOf2_64(constant)) { + int64_t shiftAmount = llvm::Log2_64(constant); + return arith::ShRUIOp::create( + rewriter, loc, val, + arith::ConstantIndexOp::create(rewriter, loc, shiftAmount) + .getResult()) + .getResult(); + } + auto constantOp = + arith::ConstantIndexOp::create(rewriter, loc, constant).getResult(); + return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); +} + +/// This function takes a larger register block `data` and generates multiple +/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block +/// starting from `offsets`. +static Value generateLoads(ConversionPatternRewriter &rewriter, + TypedValue<VectorType> data, + SmallVector<OpFoldResult> offsets, + TypedValue<xegpu::TensorDescType> newTensorDesc, + xegpu::LoadNdOp origLoadOp) { + Location loc = data.getLoc(); + assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); + Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape()); + // Compute the ratio between original shape and supported shape. We need to + // generate loads in this ratio arrangement. + auto shapeRatio = computeShapeRatio(data.getType().getShape(), + supportedShape) + .value(); // `ratio` must be defined if we reach here. + for (int64_t h = 0; h < shapeRatio[0]; ++h) { + for (int64_t w = 0; w < shapeRatio[1]; ++w) { + int64_t localOffsetDim0 = h * supportedShape[0]; + int64_t localOffsetDim1 = w * supportedShape[1]; + Value loadOffsetX = arith::AddIOp::create( + rewriter, loc, offsetDim0, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0) + .getResult()); + Value loadOffsetY = arith::AddIOp::create( + rewriter, loc, offsetDim1, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1) + .getResult()); + auto loadOp = xegpu::LoadNdOp::create( + rewriter, loc, + VectorType::get(supportedShape, data.getType().getElementType()), + newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY}, + origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), + origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), + origLoadOp.getL3HintAttr()); + // Set the layout for the loadOp. + auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); + xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + // Insert the loaded block into the right position in data. + auto insertOp = vector::InsertStridedSliceOp::create( + rewriter, loc, loadOp.getResult(), data, + ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1}, + ArrayRef<int64_t>{1, 1}); + // InsertOp must have the same layout as newTensorDesc. + xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + data = insertOp.getResult(); + } + } + return data; +} + +/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a +/// new CreateNdDescOp with optimized tensor desc type. This involves extracting +/// the base pointer from the original memory source and adjusting the shape and +/// strides of the tensor desc to fit with the new optimized transpose layout. 
+class XeGPUCreateNdDescOpPattern final
+    : public OpConversionPattern<xegpu::CreateNdDescOp> {
+public:
+  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto tdescTy = createNdOp.getType();
+    // Get the target uArch info.
+    auto chipStr = xegpu::getChipStr(createNdOp);
+    // Check if the chip is supported.
+    assert(
+        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
+        "Expecting target chip to be pvc or bmg for transpose optimization.");
+    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
+
+    auto convertType = tryOptimize(tdescTy, targetuArch);
+    if (convertType == tdescTy)
+      return failure();
+    auto strides = createNdOp.getMixedStrides();
+    auto maybeConstInnerStride = getConstantIntValue(strides.back());
+    // Only row-major memrefs are expected for now.
+    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
+      return rewriter.notifyMatchFailure(
+          createNdOp, "Expecting row-major memref for transpose optimization.");
+    Value source = createNdOp.getSource();
+    auto optionalLaneData = getMaybeLaneData(tdescTy);
+    assert(optionalLaneData && "Expected 2D lane data");
+    auto laneData = optionalLaneData.value();
+    int64_t innerLaneData = laneData[1];
+    auto memrefType = dyn_cast<MemRefType>(source.getType());
+    // Inner dimension of the shape must be adjusted based on innerLaneData.
+    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
+    modifiedShape.back() = divideByConstant(
+        rewriter, createNdOp.getLoc(),
+        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
+        innerLaneData);
+    // Similarly, the second-to-last stride must be adjusted.
+    assert(strides.size() >= 2 &&
+           "Expected at least 2 strides for CreateNdDescOp");
+    SmallVector<OpFoldResult> modifiedStrides(strides);
+    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
+        rewriter, createNdOp.getLoc(),
+        convertToValue(rewriter, createNdOp.getLoc(),
+                       modifiedStrides[modifiedStrides.size() - 2]),
+        innerLaneData);
+
+    // If the source is a static memref, we need to extract the pointer to the
+    // base address.
+    if (memrefType && memrefType.hasStaticShape()) {
+      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
+          rewriter, createNdOp.getLoc(), source);
+      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
+                                          rewriter.getI64Type(),
+                                          extractOp.getResult())
+                   .getResult();
+    }
+    // Create a new CreateNdDescOp with the modified shape and converted type.
+    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
+        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
+        modifiedStrides);
+    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
+    return success();
+  }
+};
+
+/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
+/// transpose optimization. If so, rewrites the LoadNdOp to align with the
+/// adjusted tensor desc type. This can result in multiple LoadNdOps being
+/// generated to fill in the original load shape.
+class XeGPULoadNdDescOpPattern final + : public OpConversionPattern<xegpu::LoadNdOp> { +public: + using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto origTensorDescType = loadNdOp.getTensorDescType(); + auto adaptorType = + cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType()); + if (adaptorType == origTensorDescType) + return failure(); + // Offsets must be adjusted based on innerLaneData. + auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value(); + int64_t innerLaneData = laneData[1]; + auto offsets = loadNdOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadNdOp, + "Expecting offsets in LoadNd"); + SmallVector<OpFoldResult> modifiedOffsets(offsets); + modifiedOffsets.back() = divideByConstant( + rewriter, loadNdOp.getLoc(), + convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), + innerLaneData); + // Get the 2D data shape of this loadNdOp in its original type including + // array length. + SmallVector<int64_t> origDataShape(origTensorDescType.getShape()); + // Adjust the data shape based on innerLaneData. + origDataShape.back() /= innerLaneData; + // HW supported shape is the new tensor desc shape after conversion. + SmallVector<int64_t> hwSupportedShape(adaptorType.getShape()); + VectorType origVectorType = + VectorType::get(origDataShape, adaptorType.getElementType()); + Value data; + // Orig data shape is 3D for the array length case. + if (origTensorDescType.getArrayLength() > 1) { + SmallVector<Value> arraySlices; + for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { + Value slice = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), origVectorType, + rewriter.getZeroAttr(origVectorType)); + // Increase the Y offset for each array slice. + Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), + modifiedOffsets.back()); + modifiedOffsets.back() = + arith::AddIOp::create( + rewriter, loadNdOp->getLoc(), offsetY, + arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(), + i * origDataShape[1]) + .getResult()) + .getResult(); + slice = generateLoads( + rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + // BitCast back to original load shape without array length. + auto bitcastType = VectorType::get(origTensorDescType.getShape(), + origTensorDescType.getElementType()); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + arraySlices.push_back(bitCastOp.getResult()); + } + rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); + return success(); + } + data = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + data = generateLoads( + rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + // BitCastOp must have the same layout as the original loadNdOp. 
+    xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
+                                   origTensorDescType.getLayoutAttr());
+    rewriter.replaceOp(loadNdOp, bitCastOp);
+    return success();
+  }
+};
+
+/// Vector ExtractOp must be processed if the original tensor desc type has
+/// array length greater than 1. In that case, the LoadNdOp is replaced with
+/// multiple LoadNdOps, one per array slice, which makes the extraction
+/// unnecessary, so we simply remove the ExtractOp.
+class VectorExtractOpPattern final
+    : public OpConversionPattern<vector::ExtractOp> {
+public:
+  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Check if the source of the extraction is split into multiple values.
+    if (adaptor.getSource().size() == 1)
+      return failure();
+    auto mixedPos = extractOp.getMixedPosition();
+    if (mixedPos.size() != 1)
+      return failure();
+    auto mayBeInt = getConstantIntValue(mixedPos[0]);
+    if (!mayBeInt)
+      return failure();
+    rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
+    return success();
+  }
+};
+
+} // namespace
+
+void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
+               VectorExtractOpPattern>(patterns.getContext());
+}
+
+namespace {
+
+struct XeGPUOptimizeBlockLoadsPass final
+    : public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
+          XeGPUOptimizeBlockLoadsPass> {
+  void runOnOperation() override {
+    MLIRContext &context = getContext();
+    TypeConverter converter;
+    RewritePatternSet patterns(&context);
+    ConversionTarget target(context);
+
+    // This pass is only meant for PVC and BMG targets. If an unsupported
+    // target is found, exit early.
+    bool isTargetSupported = false;
+    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
+      auto chipStr = xegpu::getChipStr(funcOp);
+      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
+        isTargetSupported = true;
+    });
+
+    if (!isTargetSupported) {
+      DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
+             << "\n";
+      return;
+    }
+
+    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
+    // converted.
+    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+        [&](xegpu::CreateNdDescOp createNdOp) {
+          return !canBeOptimizedForTranspose(createNdOp.getType());
+        });
+    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
+        [&](xegpu::LoadNdOp loadNdOp) {
+          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
+        });
+    // Vector ExtractOps can have optimizable layouts if they extract from
+    // LoadNdOps with array length greater than 1. These ExtractOps must be
+    // converted.
+ target.addDynamicallyLegalOp<vector::ExtractOp>( + [&](vector::ExtractOp extractOp) { + auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + if (!layout) + return true; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + auto laneData = layout.getEffectiveLaneDataAsInt(); + return !canBeOptimizedForTranspose(laneLayout, laneData); + }); + converter.addConversion([](Type type) { return type; }); + + target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect, + vector::VectorDialect>(); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + DBGS() << "Optimize block loads pass failed.\n"; + return signalPassFailure(); + } + } +}; + +} // namespace diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 14c49e7f45706..4e1a539771d2f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -204,28 +204,6 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; -/// Helper Function to find a proper instruction multiple for the user-supplied -/// sg-level data shape. `candidates` are uArch allowed shapes. -/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). -template <typename T> -int getLargestDivisor(T dim, ArrayRef<T> candidates, - ArrayRef<T> candidateMultiples = {}) { - static_assert(std::is_integral<T>::value, "T must be an integer type"); - int largest = -1; - SmallVector<T> multiples = {1}; - if (!candidateMultiples.empty()) - multiples = - SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); - for (T candidate : candidates) { - for (T multiple : multiples) { - int value = static_cast<int>(candidate * multiple); - if (value != 0 && dim % value == 0 && value > largest) - largest = value; - } - } - return largest; -} - /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). 
@@ -505,7 +483,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetch.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -514,7 +492,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( if (tdescTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); if (instHeight == -1) prefetch.emitWarning( @@ -634,7 +612,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataALen = aTy.getShape().front(); auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); const int maxALen = - getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -642,7 +620,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataBLen = bTy.getShape().back(); auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); const int maxBLen = - getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); if (maxBLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -662,7 +640,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataCLen = bTy.getShape().back(); auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); const int maxCLen = - getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); if (maxCLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -691,7 +669,7 @@ void LayoutInfoPropagation::visitStoreNdOp( store.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -700,7 +678,7 @@ void LayoutInfoPropagation::visitStoreNdOp( if (dataTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); if (instHeight == -1) store.emitWarning( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index d575a415a3035..de9e09d427665 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -555,3 +555,29 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.append(addElementwise(builder, loc, a, b)); return results; } + +template <typename T> +int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T 
candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + +/// Explicit instantiations +template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates, + ArrayRef<int> candidateMultiples); +template int +xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates, + ArrayRef<unsigned> candidateMultiples); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir new file mode 100644 index 0000000000000..24a0de6ed48a5 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -0,0 +1,280 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @no_scf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + gpu.return %6 : vector<8x16xf32> +} +} + +// ----- +// CHECK-LABEL: gpu.func @no_scf_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = 
#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]> +#c = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +gpu.module @xevm_module { +gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> + %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + gpu.return %6 : vector<8x16xi32> +} +} + + +// ----- +// CHECK-LABEL: gpu.func @gemm_b_transpose( +// CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] +// CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 
iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%c0, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @nested_scf( +// CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> 
vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @large_loads( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[T10]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x16xi32> to vector<32x32xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %7 = 
vector.extract_strided_slice %6 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %6 {offsets = [0, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %6 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %6 {offsets = [16, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @array_length( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> +// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +// CHECK: 
%[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> + -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16> + %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %20 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %20 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} From 6e2dcdb17c0ef4eff239a08772b194e2009193ec Mon Sep 17 00:00:00 2001 From: Kaitlin Peng <kaitlinpeng@microsoft.com> Date: Tue, 4 Nov 2025 13:20:46 -0800 Subject: [PATCH 233/313] Fix implicit truncation of `select` non-bool vector conditions (#166279) Fixes #164018. The problem is that we're unable to do an implicit conversion sequence on a template deduced argument, so the current vector templates can't reconcile `vector<int, 4>` with `vector<bool, Sz>`. This PR separates the vector templates into size-specific ones, getting rid of the `Sz` deduction and allowing for the implicit conversion to be done. --- .../lib/Headers/hlsl/hlsl_alias_intrinsics.h | 52 ++++++++++++--- clang/test/CodeGenHLSL/builtins/select.hlsl | 64 ++++++++++++++----- .../test/SemaHLSL/BuiltIns/select-errors.hlsl | 2 +- 3 files changed, 93 insertions(+), 25 deletions(-) diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index 4c5861c2c5f9d..208776eb7840e 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2111,9 +2111,17 @@ T select(bool, T, T); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal, /// vector<T,Sz> FalseVals) @@ -2123,9 +2131,17 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, T, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, T, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, T, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2134,9 +2150,17 @@ vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); /// \param TrueVals The vector values are chosen from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. 
-template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, T); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2145,10 +2169,20 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); /// \param TrueVal The scalar value to splat from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 2>> select( + vector<bool, 2>, T, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 3>> select( + vector<bool, 3>, T, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, Sz>> select( - vector<bool, Sz>, T, T); +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 4>> select( + vector<bool, 4>, T, T); //===----------------------------------------------------------------------===// // sin builtins diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl index 7590b4a881259..e5169844cb3f2 100644 --- a/clang/test/CodeGenHLSL/builtins/select.hlsl +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) { return select(cond0, tVal, fVal); } -// CHECK-LABEL: test_select_infer_array -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]] -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4 -// CHECK: ret void -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] { - return select(cond, tVal, fVal); -} - // CHECK-LABEL: test_select_bool_vector // CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) { } // CHECK-LABEL: test_select_vector_1 -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} // CHECK: ret <1 x i32> [[SELECT]] int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) { - return select<int,1>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_2 // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) { - return select<int,2>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_3 // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}} // CHECK: ret <3 x i32> [[SELECT]] int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 
fVals) { - return select<int,3>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_4 @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) { // CHECK-LABEL: test_select_vector_scalar_scalar // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0 +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] // CHECK: ret <4 x i32> [[SELECT]] int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) { return select(cond0, tVal, fVal); } + +// CHECK-LABEL: test_select_nonbool_cond_vector_4 +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}} +// CHECK: ret <4 x i1> [[SELECT]] +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}} +// CHECK: ret <3 x i32> [[SELECT]] +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8 +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]] +// CHECK: ret <2 x i32> [[SELECT]] +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +// CHECK: ret <4 x i32> [[SELECT]] +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) { + 
return select(cond0, tVal, fVal); +} diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl index 12c818acec035..b2f45051a9bd8 100644 --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0, } int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) { - return select<int,1>(p0, t0, f0); // expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector<bool, 2>') to 'vector<bool, 1>' (vector of 1 'bool' value)}} + return select<int1>(p0, t0, f0); // No diagnostic expected. } int test_select_no_args() { From 1c85981e3770bfcc8f4c11417be0cfa7de543a15 Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 13:21:04 -0800 Subject: [PATCH 234/313] [llvm][mustache] Avoid redundant saves in accessor splitting (#159197) The splitMustacheString function was saving StringRefs that were already backed by an arena-allocated string. This was unnecessary work. This change removes the redundant Ctx.Saver.save() call. This optimization provides a small but measurable performance improvement on top of the single-pass tokenizer, most notably reducing branch misses. Metric | Baseline | Optimized | Change -------------- | -------- | --------- | ------- Time (ms) | 35.77 | 35.57 | -0.56% Cycles | 35.16M | 34.91M | -0.71% Instructions | 85.77M | 85.54M | -0.27% Branch Misses | 113.9K | 111.9K | -1.76% Cache Misses | 237.7K | 242.1K | +1.85% --- llvm/lib/Support/Mustache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 24e3105c5e8a9..012e1ffd534d2 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -51,7 +51,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { std::tie(Part, Str) = Str.split('.'); // Each part of the accessor needs to be saved to the arena // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); + Tokens.push_back(Part.trim()); } } // Now, allocate memory for the array of StringRefs in the arena. From ce091da5df6c095585b9cda48843f4a0a4952b79 Mon Sep 17 00:00:00 2001 From: Syadus Sefat <42645939+mssefat@users.noreply.github.com> Date: Tue, 4 Nov 2025 15:37:27 -0600 Subject: [PATCH 235/313] [AMDGPU] Mark WMMA machine instructions as convergent (#165602) The WMMA MI(s) are missing the isConvergent flag. This causes incorrect behavior in passes like machine-sink, where WMMA instructions get sunk into divergent branches. This patch fixes the issue by setting the isConvergent flag to 1 in the VOP3PInstructions.td file. 
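
For context, `isConvergent` is what target-independent passes such as machine-sink consult before moving an instruction across control flow. A minimal sketch of the kind of guard involved (simplified for illustration; this is not the actual MachineSink code):

```cpp
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Convergent operations (like WMMA, which exchanges data across the lanes
// of a wave) must not be sunk into divergent branches, because that would
// change the set of lanes participating in the operation.
static bool mayMoveAcrossDivergentBranch(const MachineInstr &MI) {
  // isConvergent() reads the Convergent bit from the MCInstrDesc; the
  // `isConvergent = 1` TableGen change below is what sets that bit.
  return !MI.isConvergent();
}
```
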
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 10 +++-- .../CodeGen/AMDGPU/wmma-gfx12-convergent.mir | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..31d8bce4d0c87 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} 
+ ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... From 8cd22447af239206daabb42fc63e2824a1f7fb6e Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool <compnerd@compnerd.org> Date: Tue, 4 Nov 2025 13:40:42 -0800 Subject: [PATCH 236/313] AArch64: correct `preserve_most` and `preserve_all` on Windows (#166436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes register information handling for the `preserve_most` and `preserve_all` calling conventions on Windows ARM64. The root issue was cascading `if` statements whose behavior depended on their order. This patch makes the minimal, tactical change needed for Swift’s two calling conventions, unblocking current work. A broader refactor to remove the ordering dependency is still desired and will follow in a subsequent PR. --- .../AArch64/AArch64CallingConvention.td | 6 +++ .../Target/AArch64/AArch64RegisterInfo.cpp | 14 +++++-- llvm/test/CodeGen/AArch64/preserve_mostcc.ll | 38 ++++++++++++------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..a5048b9c9e61d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? 
CSR_Win_AArch64_RT_AllRegs_SaveList
+               : CSR_AArch64_RT_AllRegs_SaveList;
+
   if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
     return CSR_Win_AArch64_CFGuard_Check_SaveList;
   if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) {
@@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_AArch64_AAPCS_SwiftError_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail)
     return CSR_AArch64_AAPCS_SwiftTail_SaveList;
-  if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
-    return CSR_AArch64_RT_MostRegs_SaveList;
-  if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll)
-    return CSR_AArch64_RT_AllRegs_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::Win64)
     // This is for OSes other than Windows; Windows is a separate case further
     // above.
diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 7f0968c8eb339..75c8567e2095e 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s

 declare void @standard_cc_func()
 declare preserve_mostcc void @preserve_mostcc_func()
@@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func()
 define preserve_mostcc void @preserve_mostcc1() nounwind {
 entry:
 ;CHECK-LABEL: preserve_mostcc1
-;CHECK-NOT: stp
-;CHECK-NOT: str
-;CHECK: str x15
-;CHECK-NEXT: stp x14, x13,
-;CHECK-NEXT: stp x12, x11,
-;CHECK-NEXT: stp x10, x9,
-;CHECK: bl _standard_cc_func
+;CHECK-DARWIN-NOT: stp
+;CHECK-DARWIN-NOT: str
+;CHECK-DARWIN: str x15
+;CHECK-DARWIN-NEXT: stp x14, x13,
+;CHECK-DARWIN-NEXT: stp x12, x11,
+;CHECK-DARWIN-NEXT: stp x10, x9,
+;CHECK-WIN: stp x15, x14
+;CHECK-WIN-NEXT: stp x13, x12,
+;CHECK-WIN-NEXT: stp x11, x10,
+;CHECK-WIN-NEXT: stp x9, x30
+;CHECK: bl {{_?}}standard_cc_func
   call void @standard_cc_func()
-;CHECK: ldp x10, x9,
-;CHECK-NEXT: ldp x12, x11,
-;CHECK-NEXT: ldp x14, x13,
-;CHECK-NEXT: ldr x15
+;CHECK-DARWIN: ldp x10, x9,
+;CHECK-DARWIN-NEXT: ldp x12, x11,
+;CHECK-DARWIN-NEXT: ldp x14, x13,
+;CHECK-DARWIN-NEXT: ldr x15
+;CHECK-WIN: ldp x9, x30
+;CHECK-WIN-NEXT: ldp x11, x10,
+;CHECK-WIN-NEXT: ldp x13, x12,
+;CHECK-WIN-NEXT: ldp x15, x14,
   ret void
 }

@@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind {
 entry:
 ;CHECK-LABEL: preserve_mostcc2
 ;CHECK-NOT: x14
-;CHECK: stp x29, x30,
+;CHECK-DARWIN: stp x29, x30,
+;CHECK-WIN: str x30
 ;CHECK-NOT: x14
-;CHECK: bl _preserve_mostcc_func
+;CHECK: bl {{_?}}preserve_mostcc_func
   call preserve_mostcc void @preserve_mostcc_func()
   ret void
 }

From 025e431e7450cada2724b19eb59354a6c020fa4f Mon Sep 17 00:00:00 2001
From: Alireza Torabian <alireza.torabian@huawei.com>
Date: Tue, 4 Nov 2025 16:48:39 -0500
Subject: [PATCH 237/313] [LoopFusion] Forget loop and block dispositions after
 latch merge (#166233)

Merging the latches of loops may affect the dispositions, so they should
be forgotten after the merge. This patch fixes the crash in loop fusion
[#164082](https://github.com/llvm/llvm-project/issues/164082).
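
In other words, the safe ordering is (a condensed sketch of the change
below, using the names from LoopFuse.cpp):

```cpp
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
// mergeLatch() may erase FC0's latch block, so cached block/loop
// dispositions that point at it can only be invalidated afterwards.
mergeLatch(FC0, FC1);
SE.forgetBlockAndLoopDispositions();
```
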
---
 llvm/lib/Transforms/Scalar/LoopFuse.cpp     | 16 +++--
 llvm/test/Transforms/LoopFusion/pr164082.ll | 65 +++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopFusion/pr164082.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 19eccb9e17020..9ffa602416b05 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1796,14 +1796,16 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    // Forget block dispositions as well, so that there are no dangling
-    // pointers to erased/free'ed blocks.
-    SE.forgetBlockAndLoopDispositions();

     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
     mergeLatch(FC0, FC1);

+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+    // since merging the latches may affect the dispositions.
+    SE.forgetBlockAndLoopDispositions();
+
     // Merge the loops.
     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
     for (BasicBlock *BB : Blocks) {
@@ -2092,14 +2094,16 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    // Forget block dispositions as well, so that there are no dangling
-    // pointers to erased/free'ed blocks.
-    SE.forgetBlockAndLoopDispositions();

     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
     mergeLatch(FC0, FC1);

+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+    // since merging the latches may affect the dispositions.
+    SE.forgetBlockAndLoopDispositions();
+
     // Merge the loops.
     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
     for (BasicBlock *BB : Blocks) {
diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll
new file mode 100644
index 0000000000000..652557cef48f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/pr164082.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; STAT: 1 loop-fusion - Loops fused
+
+; C Code
+;
+;; for (int i = 0; i < 100; ++i)
+;;   Array[i][i] = -i;
+;; for (int row = 0; row < 100; ++row)
+;;   for (int col = 0; col < 100; ++col)
+;;     if (col != row)
+;;       Array[row][col] = row + col;
+;
+; Loop fusion should not crash anymore as now forgetBlockAndLoopDispositions()
+; is triggered after mergeLatch() during the fusion.
+
+define i32 @forget_dispositions() nounwind {
+entry:
+  %Array = alloca [100 x [100 x i32]], align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ]
+  %0 = trunc i64 %indvars.iv33 to i32
+  %sub = sub i32 0, %0
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33
+  store i32 %sub, ptr %arrayidx2, align 4
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, 100
+  br i1 %exitcond36, label %for.cond6.preheader, label %for.body
+
+for.cond6.preheader:                              ; preds = %for.body, %for.inc17
+  %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.inc14, %for.cond6.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ]
+  %1 = trunc i64 %indvars.iv to i32
+  %2 = trunc i64 %indvars.iv29 to i32
+  %cmp9 = icmp eq i32 %1, %2
+  br i1 %cmp9, label %for.inc14, label %if.then
+
+if.then:                                          ; preds = %for.body8
+  %3 = add i64 %indvars.iv, %indvars.iv29
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv
+  %4 = trunc i64 %3 to i32
+  store i32 %4, ptr %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body8, %if.then
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32
+  %exitcond28 = icmp eq i32 %lftr.wideiv27, 100
+  br i1 %exitcond28, label %for.inc17, label %for.body8
+
+for.inc17:                                        ; preds = %for.inc14
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, 100
+  br i1 %exitcond32, label %for.exit, label %for.cond6.preheader
+
+for.exit:                                         ; preds = %for.inc17
+  ret i32 0
+}

From ce091da5df6c095585b9cda48843f4a0a4952b79 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang <hanhan0912@gmail.com>
Date: Tue, 4 Nov 2025 13:49:46 -0800
Subject: [PATCH 238/313] Revert "[mlir][memref]: Collapse strided unit dim
 even if strides are dynamic" (#166448)

Reverts llvm/llvm-project#157330

The original revision introduces a bug in `isGuaranteedCollapsible`: the
`memref<3x3x1x96xf32, strided<[288, 96, 96, 1], offset: 864>>` is no
longer collapsible with that change. This revision reverts the change to
restore the correct behavior. `stride` should be computed as `96`, as in
the old behavior, in the failed iteration.

https://github.com/llvm/llvm-project/blob/92a1eb37122fa24e3045fbabdea2bf87127cace5/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp#L2597-L2605

---
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 10 +++++-----
 mlir/test/Dialect/MemRef/ops.mlir        |  7 +------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index e271ac58db327..1c21a2f270da6 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2568,11 +2568,6 @@ computeCollapsedLayoutMap(MemRefType srcType,
   auto trailingReassocs = ArrayRef<int64_t>(reassoc).drop_front();
   auto stride = SaturatedInteger::wrap(resultStrides[resultStrideIndex--]);
   for (int64_t idx : llvm::reverse(trailingReassocs)) {
- if (srcShape[idx - 1] == 1) - continue; - stride = stride * SaturatedInteger::wrap(srcShape[idx]); // Both source and result stride must have the same static value. In that @@ -2587,6 +2582,11 @@ computeCollapsedLayoutMap(MemRefType srcType, if (strict && (stride.saturated || srcStride.saturated)) return failure(); + // Dimensions of size 1 should be skipped, because their strides are + // meaningless and could have any arbitrary value. + if (srcShape[idx - 1] == 1) + continue; + if (!stride.saturated && !srcStride.saturated && stride != srcStride) return failure(); } diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index b1db99bb3ad08..a90c9505a8405 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -440,8 +440,7 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>, %arg4: index, %arg5: index, %arg6: index, - %arg7: memref<4x?x4xf32>, - %arg8: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>) { + %arg7: memref<4x?x4xf32>) { // CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]] // CHECK-SAME: memref<?x?x?xf32> into memref<?x?xf32> %0 = memref.collapse_shape %arg0 [[0, 1], [2]] : @@ -490,10 +489,6 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>, // CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]] %4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2, %arg4, 2, 2] : memref<4x?x4xf32> into memref<2x2x?x2x2xf32> - -// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3]] -// CHECK-SAME: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>> - %5 = memref.collapse_shape %arg8 [[0, 1], [2], [3]] : memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>> return } From cf73a0b102c012f30e5ee43638a78ed21e6b81b3 Mon Sep 17 00:00:00 2001 From: David Green <david.green@arm.com> Date: Tue, 4 Nov 2025 21:55:57 +0000 Subject: [PATCH 239/313] [AArch64] Copy implicit def operands when creating LDP. (#164253) Otherwise we might end up with undefined register uses. Copying implicit uses can cause problems where a register is both defined and used in the same LDP, so I have not tried to add them here. Fixes #164230 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 19 +++++ llvm/test/CodeGen/AArch64/ldst-implicitop.mir | 80 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/ldst-implicitop.mir diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. 
+ auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... 
+---
+name: impdef_both_same
+body:             |
+  bb.0:
+    liveins: $lr
+    ; CHECK-LABEL: name: impdef_both_same
+    ; CHECK: liveins: $lr
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+    ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+    ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+    ; CHECK-NEXT: RET_ReallyLR
+    renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+    renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128))
+    $q0 = ORRv16i8 $q4, killed $q4
+    $q1 = ORRv16i8 $q5, killed $q5
+    RET_ReallyLR
+...
From c93df83b0469902ad22de3e98a8325406a96b960 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Tue, 4 Nov 2025 14:03:45 -0800
Subject: [PATCH 240/313] [opt-viewer] Account for optimization records YAML
 files generated by LTO (#135059)

The optimization records YAML files generated by Clang's LTO pipeline
are named "*.opt.ld.yaml" rather than "*.opt.yaml". This patch adds that
pattern into the search list of `find_opt_files` as well.
---
 llvm/tools/opt-viewer/optrecord.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py
index b9244fd1ae739..07e21028535c4 100644
--- a/llvm/tools/opt-viewer/optrecord.py
+++ b/llvm/tools/opt-viewer/optrecord.py
@@ -344,6 +344,8 @@ def find_opt_files(*dirs_or_files):
             d for d in subdirs if not os.path.ismount(os.path.join(dir, d))
         ]
         for file in files:
-            if fnmatch.fnmatch(file, "*.opt.yaml*"):
+            if fnmatch.fnmatch(file, "*.opt.yaml*") or fnmatch.fnmatch(
+                file, "*.opt.ld.yaml*"
+            ):
                 all.append(os.path.join(dir, file))
     return all
From d4222bf9e2175dc8d0707442802a222d652d0116 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 4 Nov 2025 23:29:13 +0100
Subject: [PATCH 241/313] [libc++] Use saturation builtins directly for
 {add,sub}_sat (#165228)

This doesn't improve performance (except with optimizations disabled),
since the compiler is able to fold our current implementation. However,
it does significantly reduce the amount of code the compiler has to
sift through, reducing compile times a bit.
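For reference, a minimal standalone sketch of the two formulations this change switches between, using plain `int`. The function names are illustrative, not libc++ code; `__builtin_elementwise_add_sat` and `__builtin_add_overflow` are existing Clang builtins:

```c++
#include <climits>

// New form: the compiler emits the saturating add directly.
// (Illustrative helper, not the actual libc++ implementation.)
int add_sat_builtin(int x, int y) {
  return __builtin_elementwise_add_sat(x, y);
}

// Old form: detect wraparound, then clamp toward the operands' shared sign.
int add_sat_manual(int x, int y) {
  int sum;
  if (!__builtin_add_overflow(x, y, &sum))
    return sum;
  // Signed addition overflows only when both operands have the same sign.
  return x < 0 ? INT_MIN : INT_MAX;
}
```

Under optimizations both forms fold to the same saturating sequence; the builtin simply gives the frontend far less code to process.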
--- libcxx/include/__numeric/saturation_arithmetic.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..7a7410b5dea08 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> From a51d219ee7ac9ca96ade7639bff5097c8b79c130 Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt <ndrewh@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:32:06 -0800 Subject: [PATCH 242/313] [sanitizer_common] Add arm64e module type (#166018) This will fix some symbolication failures on arm64e machines when the symbolicator passes the (wrong) architecture string to atos. 
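As a quick sketch of the new mapping, in terms of the functions this patch touches (surrounding setup omitted):

```c++
// An arm64e Mach-O image now maps to its own enum value rather than plain
// arm64, so the architecture string handed to atos becomes "arm64e".
ModuleArch Arch = ModuleArchFromCpuType(CPU_TYPE_ARM64, CPU_SUBTYPE_ARM64E);
const char *Name = ModuleArchToString(Arch); // "arm64e"
```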
--- .../lib/asan/scripts/asan_symbolize.py | 1 + .../lib/sanitizer_common/sanitizer_common.h | 3 ++ .../sanitizer_procmaps_mac.cpp | 47 ++++++++++++------- .../tests/sanitizer_procmaps_test.cpp | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py index 8ecd66c745119..091e9bcc9a796 100755 --- a/compiler-rt/lib/asan/scripts/asan_symbolize.py +++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py @@ -59,6 +59,7 @@ def is_valid_arch(s): "armv7s", "armv7k", "arm64", + "arm64e", "powerpc64", "powerpc64le", "s390x", diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index ba85a0eb5a35e..b515b15b327d8 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -737,6 +737,7 @@ enum ModuleArch { kModuleArchARMV7S, kModuleArchARMV7K, kModuleArchARM64, + kModuleArchARM64E, kModuleArchLoongArch64, kModuleArchRISCV64, kModuleArchHexagon @@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) { return "armv7k"; case kModuleArchARM64: return "arm64"; + case kModuleArchARM64E: + return "arm64e"; case kModuleArchLoongArch64: return "loongarch64"; case kModuleArchRISCV64: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp index a9533d6fc04ca..a5ec85ae16460 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp @@ -20,18 +20,21 @@ #include <mach/mach.h> // These are not available in older macOS SDKs. -#ifndef CPU_SUBTYPE_X86_64_H -#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7S -#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7K -#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) -#endif -#ifndef CPU_TYPE_ARM64 -#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) -#endif +# ifndef CPU_SUBTYPE_X86_64_H +# define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7S +# define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7K +# define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) +# endif +# ifndef CPU_TYPE_ARM64 +# define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +# endif +# ifndef CPU_SUBTYPE_ARM64E +# define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2) +# endif namespace __sanitizer { @@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) { case CPU_TYPE_I386: return kModuleArchI386; case CPU_TYPE_X86_64: - if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64; - if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H; + if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) + return kModuleArchX86_64; + if (cpusubtype == CPU_SUBTYPE_X86_64_H) + return kModuleArchX86_64H; CHECK(0 && "Invalid subtype of x86_64"); return kModuleArchUnknown; case CPU_TYPE_ARM: - if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6; - if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7; - if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S; - if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K; + if (cpusubtype == CPU_SUBTYPE_ARM_V6) + return kModuleArchARMV6; + if (cpusubtype == CPU_SUBTYPE_ARM_V7) + return kModuleArchARMV7; + if 
(cpusubtype == CPU_SUBTYPE_ARM_V7S) + return kModuleArchARMV7S; + if (cpusubtype == CPU_SUBTYPE_ARM_V7K) + return kModuleArchARMV7K; CHECK(0 && "Invalid subtype of ARM"); return kModuleArchUnknown; case CPU_TYPE_ARM64: + if (cpusubtype == CPU_SUBTYPE_ARM64E) + return kModuleArchARM64E; return kModuleArchARM64; default: CHECK(0 && "Invalid CPU type"); diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp index 00542b944f516..c18e5bd9f3194 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp @@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) { EXPECT_EQ(arch, kModuleArchI386); } else if (SANITIZER_WORDSIZE == 64) { EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H || - arch == kModuleArchARM64); + arch == kModuleArchARM64 || arch == kModuleArchARM64E); } const u8 *uuid = modules[i].uuid(); u8 null_uuid[kModuleUUIDSize] = {0}; From ac5b6151976c70c8b676d3bc6ff82895fe0e1d01 Mon Sep 17 00:00:00 2001 From: yonghong-song <yhs@fb.com> Date: Tue, 4 Nov 2025 15:15:33 -0800 Subject: [PATCH 243/313] [BPF] Remove dead code related to __bpf_trap global var (#166440) In [1], the symbol __bpf_trap (macro BPF_TRAP) is removed if it is not used in the code. In the discussion in [1], it is found that the branch "if (Op.isSymbol())" is actually always false. Remove it to avoid confusion. [1] https://github.com/llvm/llvm-project/pull/166003 --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 378a72ab27dd5..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -176,10 +176,6 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { if (const GlobalValue *GV = Op.getGlobal()) if (GV->getName() == BPF_TRAP) SawTrapCall = true; - } else if (Op.isSymbol()) { - if (const MCSymbol *Sym = Op.getMCSymbol()) - if (Sym->getName() == BPF_TRAP) - SawTrapCall = true; } } } From 4cdeb7d292bf28d2be22eb2e3870d0389dc6c860 Mon Sep 17 00:00:00 2001 From: "A. Jiang" <de34@live.cn> Date: Wed, 5 Nov 2025 07:24:59 +0800 Subject: [PATCH 244/313] [libc++] Remove guards for builtins for reference binding traits (#166288) We're only supporting Clang 20+ and Apple Clang 17 now, where these builtins are universally implemented. 
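For illustration (not part of the diff below), here is what the now-unconditional C++23 traits report:

```c++
#include <type_traits>

// Constructing const int& from a long materializes a temporary int: true.
static_assert(std::reference_constructs_from_temporary_v<const int&, long>);
// Binding const int& to an int lvalue binds directly to it: false.
static_assert(!std::reference_constructs_from_temporary_v<const int&, int&>);
```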
--- .../reference_constructs_from_temporary.h | 2 +- .../__type_traits/reference_converts_from_temporary.h | 2 +- libcxx/include/tuple | 2 +- .../libcxx/type_traits/no_specializations.verify.cpp | 10 +++------- .../reference_constructs_from_temporary.pass.cpp | 3 --- .../reference_converts_from_temporary.pass.cpp | 3 --- .../tuple.tuple/tuple.apply/make_from_tuple.verify.cpp | 4 ---- 7 files changed, 6 insertions(+), 20 deletions(-) diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..3c5330dd6e14e 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -1443,7 +1443,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be 
specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. - // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 From dc3b5141cdf704bec28edeed78d6d72ebe0444ae Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 15:56:00 -0800 Subject: [PATCH 245/313] [llvm][mustache] Optimize accessor splitting with a single pass (#159198) The splitMustacheString function previously used a loop of StringRef::split and StringRef::trim. This was inefficient as it scanned each segment of the accessor string multiple times. This change introduces a custom splitAndTrim function that performs both operations in a single pass over the string, reducing redundant work and improving performance, most notably in the number of CPU cycles executed. 
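For illustration, a usage sketch of the new single-pass helper added below (`splitAndTrim` is file-local to Mustache.cpp; the accessor string here is made up):

```c++
llvm::SmallVector<llvm::StringRef, 4> Toks;
splitAndTrim("outer. inner .leaf", Toks); // hypothetical accessor input
// Toks == {"outer", "inner", "leaf"}; whitespace-only segments are skipped
// rather than pushed as empty tokens.
```

Benchmark results for the change: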
| Metric | Baseline | Optimized | Change |
| --- | --- | --- | --- |
| Time (ms) | 35.57 | 35.36 | -0.59% |
| Cycles | 34.91M | 34.26M | -1.86% |
| Instructions | 85.54M | 85.24M | -0.35% |
| Branch Misses | 111.9K | 112.2K | +0.27% |
| Cache Misses | 242.1K | 239.9K | -0.91% |
---
 llvm/lib/Support/Mustache.cpp | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 012e1ffd534d2..9eb1ec2b8425c 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) {
   return isFalsey(*V);
 }
 
+static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) {
+  size_t CurrentPos = 0;
+  while (CurrentPos < Str.size()) {
+    // Find the next delimiter.
+    size_t DelimiterPos = Str.find('.', CurrentPos);
+
+    // If no delimiter is found, process the rest of the string.
+    if (DelimiterPos == StringRef::npos)
+      DelimiterPos = Str.size();
+
+    // Get the current part, which may have whitespace.
+    StringRef Part = Str.slice(CurrentPos, DelimiterPos);
+
+    // Manually trim the part without creating a new string object.
+    size_t Start = Part.find_first_not_of(" \t\r\n");
+    if (Start != StringRef::npos) {
+      size_t End = Part.find_last_not_of(" \t\r\n");
+      Tokens.push_back(Part.slice(Start, End + 1));
+    }
+
+    // Move past the delimiter for the next iteration.
+    CurrentPos = DelimiterPos + 1;
+  }
+}
+
 static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
   // We split the mustache string into an accessor.
   // For example:
@@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
     // It's a literal, so it doesn't need to be saved.
     Tokens.push_back(".");
   } else {
-    while (!Str.empty()) {
-      StringRef Part;
-      std::tie(Part, Str) = Str.split('.');
-      // Each part of the accessor needs to be saved to the arena
-      // to ensure it has a stable address.
-      Tokens.push_back(Part.trim());
-    }
+    splitAndTrim(Str, Tokens);
   }
 
   // Now, allocate memory for the array of StringRefs in the arena.
StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); From 2b4ac6629782fc527eb8ef232f6d14d48186a7f4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 15:57:39 -0800 Subject: [PATCH 246/313] AMDGPU: Cleanup and modernize limit-coalesce.mir test (#166465) --- llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 63 ++++++--------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..02eda2c4822c2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -6,40 +6,12 @@ # No more registers shall be defined --- name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-LABEL: name: main @@ -59,20 +31,21 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
From 0fd029858a819dfbc5e4ec18d456d055359b0317 Mon Sep 17 00:00:00 2001 From: Shubh Pachchigar <33875085+shubhe25p@users.noreply.github.com> Date: Tue, 4 Nov 2025 16:18:53 -0800 Subject: [PATCH 247/313] [libc] Add chown and getgid implementations (#166434) Implements chown and getgid per the POSIX specification and adds corresponding unit tests. getgid is added as it is required by the chown unit tests. This PR will address #165785 Co-authored-by: shubh@DOE <shubhp@mbm3a24.local> --- libc/config/linux/x86_64/entrypoints.txt | 2 + libc/hdr/types/CMakeLists.txt | 8 ++++ libc/hdr/types/gid_t.h | 22 ++++++++++ libc/include/unistd.yaml | 15 +++++++ libc/src/unistd/CMakeLists.txt | 14 +++++++ libc/src/unistd/chown.h | 22 ++++++++++ libc/src/unistd/getgid.h | 22 ++++++++++ libc/src/unistd/linux/CMakeLists.txt | 28 +++++++++++++ libc/src/unistd/linux/chown.cpp | 29 ++++++++++++++ libc/src/unistd/linux/getgid.cpp | 23 +++++++++++ libc/test/src/unistd/CMakeLists.txt | 30 ++++++++++++++ libc/test/src/unistd/chown_test.cpp | 51 ++++++++++++++++++++++++ libc/test/src/unistd/getgid_test.cpp | 15 +++++++ 13 files changed, 281 insertions(+) create mode 100644 libc/hdr/types/gid_t.h create mode 100644 libc/src/unistd/chown.h create mode 100644 libc/src/unistd/getgid.h create mode 100644 libc/src/unistd/linux/chown.cpp create mode 100644 libc/src/unistd/linux/getgid.cpp create mode 100644 libc/test/src/unistd/chown_test.cpp create mode 100644 libc/test/src/unistd/getgid_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..a44e2041e57f2 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -344,6 +345,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..0e5b22e627b67 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -195,6 +204,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..337480cbbf928 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -160,6 +167,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c2dacc6456e27 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -276,6 +290,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. 
+ +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..07070535459ec 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -437,6 +457,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/libc/test/src/unistd/getgid_test.cpp b/libc/test/src/unistd/getgid_test.cpp new file mode 100644 index 0000000000000..77dbad2f18e00 --- /dev/null +++ b/libc/test/src/unistd/getgid_test.cpp @@ -0,0 +1,15 @@ +//===-- Unittests for getgid ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/unistd/getgid.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcGetGidTest, SmokeTest) {
+  // getgid always succeeds. So, we just call it as a smoke test.
+  LIBC_NAMESPACE::getgid();
+}
From 8648beff758db89e2eb816bd01d9b3c37e6aa3f9 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he@intel.com>
Date: Wed, 5 Nov 2025 08:20:29 +0800
Subject: [PATCH 248/313] [NFC][libclc] Rename clc_cbrt.inc to clc_cbrt.h
 (#166330)

__clc_cbrt functions are declared in clc_cbrt.inc. Rename to .h for
consistency with other headers.
---
 libclc/clc/include/clc/math/{clc_cbrt.inc => clc_cbrt.h} | 0
 libclc/clc/lib/generic/math/clc_cbrt.cl                  | 1 +
 libclc/opencl/lib/generic/math/cbrt.cl                   | 2 +-
 3 files changed, 2 insertions(+), 1 deletion(-)
 rename libclc/clc/include/clc/math/{clc_cbrt.inc => clc_cbrt.h} (100%)

diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h
similarity index 100%
rename from libclc/clc/include/clc/math/clc_cbrt.inc
rename to libclc/clc/include/clc/math/clc_cbrt.h
diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl
index 105f6329d5bad..935b7b7eae78c 100644
--- a/libclc/clc/lib/generic/math/clc_cbrt.cl
+++ b/libclc/clc/lib/generic/math/clc_cbrt.cl
@@ -8,6 +8,7 @@
 #include <clc/clc_convert.h>
 #include <clc/internal/clc.h>
+#include <clc/math/clc_cbrt.h>
 #include <clc/math/clc_copysign.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_fma.h>
diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl
index 0d670150ed4c9..7de61436522b3 100644
--- a/libclc/opencl/lib/generic/math/cbrt.cl
+++ b/libclc/opencl/lib/generic/math/cbrt.cl
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_cbrt.inc>
+#include <clc/math/clc_cbrt.h>
 #include <clc/opencl/math/cbrt.h>
 
 #define __CLC_FUNCTION cbrt
From d8e56988f87ecdfa0b9bfdb269f6d904840cc6dd Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com>
Date: Wed, 5 Nov 2025 01:20:51 +0100
Subject: [PATCH 249/313] [libc] Add printf error handling (with fixes)
 (#166382)

https://github.com/llvm/llvm-project/issues/159474

Resubmitting https://github.com/llvm/llvm-project/pull/162876 with fixes
as it broke some buildbots:
- Fix comparisons of integer expressions of different signedness
- Don't check for specific errnos in tests, as they might not be
  available on all platforms
---
 libc/src/stdio/CMakeLists.txt                 | 24 +++++++++
 libc/src/stdio/asprintf.cpp                   | 18 ++++++-
 libc/src/stdio/baremetal/CMakeLists.txt       |  8 +++
 libc/src/stdio/baremetal/printf.cpp           | 23 ++++++--
 libc/src/stdio/baremetal/vprintf.cpp          | 23 ++++++--
 libc/src/stdio/generic/CMakeLists.txt         |  4 ++
 libc/src/stdio/generic/fprintf.cpp            | 17 +++++-
 libc/src/stdio/generic/printf.cpp             | 17 +++++-
 libc/src/stdio/generic/vfprintf.cpp           | 17 +++++-
 libc/src/stdio/generic/vprintf.cpp            | 17 +++++-
 libc/src/stdio/printf_core/CMakeLists.txt     | 25 +++++++++
 libc/src/stdio/printf_core/core_structs.h     | 19 ++++---
 libc/src/stdio/printf_core/error_mapper.h     | 21 ++++++++
 .../stdio/printf_core/generic/CMakeLists.txt  |  8 +++
 .../stdio/printf_core/generic/error_mapper.h  | 49 +++++++++++++++++
 .../stdio/printf_core/linux/CMakeLists.txt    |  8 +++
 .../stdio/printf_core/linux/error_mapper.h    | 54
+++++++++++++++++++ libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 +++---- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++++++----- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 ++++++- libc/src/stdio/sprintf.cpp | 18 ++++++- libc/src/stdio/vasprintf.cpp | 16 +++++- libc/src/stdio/vsnprintf.cpp | 19 ++++++- libc/src/stdio/vsprintf.cpp | 17 +++++- libc/src/stdlib/CMakeLists.txt | 6 +++ libc/src/stdlib/strfromd.cpp | 11 +++- libc/src/stdlib/strfromf.cpp | 11 +++- libc/src/stdlib/strfroml.cpp | 11 +++- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 + libc/test/src/stdio/fprintf_test.cpp | 24 +++++++++ .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ++++++ libc/test/src/stdio/vfprintf_test.cpp | 5 ++ libc/test/src/stdlib/StrfromTest.h | 19 ++++++- 39 files changed, 588 insertions(+), 104 deletions(-) create mode 100644 libc/src/stdio/printf_core/error_mapper.h create mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h create mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..0991dfca6a059 100644 --- a/libc/src/stdio/asprintf.cpp +++ 
b/libc/src/stdio/asprintf.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..5a9b19ff20471 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if 
(retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..c172b368d15f3 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b2033901557a0 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..8d159d5c70870 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..a26f082ed9347 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 08d71515646ed..ae2160219f2bb 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index ee66145e60156..624129b2b36e7 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,6 +32,17 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${LIBC_TARGET_OS}) +else() + add_subdirectory(generic) +endif() + +set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) +if(NOT TARGET ${target_error_mapper}) + set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) +endif() + add_header_library( printf_config HDRS @@ -47,6 +58,7 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits + libc.hdr.errno_macros ) add_header_library( @@ -125,6 +137,7 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list + libc.src.__support.error_or ) add_header_library( @@ -136,10 +149,20 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list + libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + ${target_error_mapper} + libc.src.__support.macros.properties.architectures +) + if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -152,8 +175,10 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file + libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) + diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index e27f77b6b594a..0d41f2244d8da 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the printf return values for when an error has occurred. They are -// all negative, and should be distinct. -constexpr int FILE_WRITE_ERROR = -1; -constexpr int FILE_STATUS_ERROR = -2; -constexpr int NULLPTR_WRITE_ERROR = -3; -constexpr int INT_CONVERSION_ERROR = -4; -constexpr int FIXED_POINT_CONVERSION_ERROR = -5; -constexpr int ALLOCATION_ERROR = -6; +// These are the error return values used by the printf engine when an +// error has occurred. They are all large negative, distinct values starting +// from -1000 to not overlap with system errors. +constexpr int FILE_WRITE_ERROR = -1001; +constexpr int FILE_STATUS_ERROR = -1002; +constexpr int NULLPTR_WRITE_ERROR = -1003; +constexpr int INT_CONVERSION_ERROR = -1004; +constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; +constexpr int ALLOCATION_ERROR = -1006; +constexpr int OVERFLOW_ERROR = -1007; + } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h new file mode 100644 index 0000000000000..23030930133a1 --- /dev/null +++ b/libc/src/stdio/printf_core/error_mapper.h @@ -0,0 +1,21 @@ +//===-- Error mapper for printf ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H + +#include "src/__support/macros/properties/architectures.h" + +// Maps internal errors to the available errnos on the platform. +#if defined(__linux__) +#include "linux/error_mapper.h" +#else +#include "generic/error_mapper.h" +#endif + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h new file mode 100644 index 0000000000000..d8cdd2cc2dbaa --- /dev/null +++ b/libc/src/stdio/printf_core/generic/error_mapper.h @@ -0,0 +1,49 @@ +//===-- Generic implementation of error mapper ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to the available C standard errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+  case FILE_STATUS_ERROR:
+  case NULLPTR_WRITE_ERROR:
+  case ALLOCATION_ERROR:
+    return EDOM;
+  case INT_CONVERSION_ERROR:
+  case FIXED_POINT_CONVERSION_ERROR:
+  case OVERFLOW_ERROR:
+    return ERANGE;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EDOM;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
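[Editorial note] Every printf entry point later in this series consumes the mapper through the same epilogue; a minimal caller-side sketch, condensed from the snprintf/sprintf/vprintf hunks below (the helper name `finish_printf` is hypothetical):

    // Convert the internal ErrorOr<size_t> result into the C-style int
    // result, setting libc_errno on failure.
    LIBC_INLINE int finish_printf(ErrorOr<size_t> ret_val) {
      if (!ret_val.has_value()) {
        // Internal printf error: translate it to a platform errno.
        libc_errno = printf_core::internal_error_to_errno(ret_val.error());
        return -1;
      }
      if (ret_val.value() >
          static_cast<size_t>(cpp::numeric_limits<int>::max())) {
        // More than INT_MAX bytes were written; report this as an overflow.
        libc_errno =
            printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
        return -1;
      }
      return static_cast<int>(ret_val.value());
    }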
diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)
diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h
new file mode 100644
index 0000000000000..3c2fe663072d0
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/error_mapper.h
@@ -0,0 +1,54 @@
+//===-- Linux implementation of error mapper --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to POSIX errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+    return EIO;
+  case FILE_STATUS_ERROR:
+    return EIO;
+  case NULLPTR_WRITE_ERROR:
+    return EINVAL;
+  case INT_CONVERSION_ERROR:
+    return ERANGE;
+  case FIXED_POINT_CONVERSION_ERROR:
+    return EINVAL;
+  case ALLOCATION_ERROR:
+    return ENOMEM;
+  case OVERFLOW_ERROR:
+    return EOVERFLOW;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EINVAL;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h
index 57f29858d5298..1c7a7237c097d 100644
--- a/libc/src/stdio/printf_core/printf_main.h
+++ b/libc/src/stdio/printf_core/printf_main.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H
 
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {
 
 template <WriteMode write_mode>
-int printf_main(Writer<write_mode> *writer, const char *__restrict str,
-                internal::ArgList &args) {
+ErrorOr<size_t> printf_main(Writer<write_mode> *writer,
+                            const char *__restrict str,
+                            internal::ArgList &args) {
   Parser<internal::ArgList> parser(str, args);
   int result = 0;
   for (FormatSection cur_section = parser.get_next_section();
@@ -33,9 +35,9 @@
       result = convert(writer, cur_section);
     else
       result = writer->write(cur_section.raw_string);
-    if (result < 0)
-      return result;
+    if (result < 0)
+      return Error(-result);
   }
 
   return writer->get_chars_written();
diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h
index 283d8df2810fb..41df17b67f35b 100644
--- a/libc/src/stdio/printf_core/vasprintf_internal.h
+++ b/libc/src/stdio/printf_core/vasprintf_internal.h
@@ -10,6 +10,7 @@
 #include "hdr/func/malloc.h"
 #include "hdr/func/realloc.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/printf_main.h"
 #include "src/stdio/printf_core/writer.h"
@@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {
   if (new_buff == nullptr) {
     if (wb->buff != wb->init_buff)
       free(wb->buff);
-    return printf_core::ALLOCATION_ERROR;
+    return ALLOCATION_ERROR;
   }
   if (isBuffOnStack)
     inline_memcpy(new_buff, wb->buff, wb->buff_cur);
@@ -42,27 +43,28 @@
 
 constexpr size_t DEFAULT_BUFFER_SIZE = 200;
 
-LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format,
-                                   internal::ArgList args) {
+LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret,
+                                               const char *__restrict format,
+                                               internal::ArgList args) {
   char init_buff_on_stack[DEFAULT_BUFFER_SIZE];
   printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb(
       init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook);
   printf_core::Writer writer(wb);
 
   auto ret_val = printf_core::printf_main(&writer, format, args);
-  if (ret_val < 0) {
+  if (!ret_val.has_value()) {
     *ret = nullptr;
-    return -1;
+    return ret_val;
   }
   if (wb.buff == init_buff_on_stack) {
-    *ret = static_cast<char *>(malloc(ret_val + 1));
+    *ret = static_cast<char *>(malloc(ret_val.value() + 1));
     if (ret == nullptr)
-      return printf_core::ALLOCATION_ERROR;
-    inline_memcpy(*ret, wb.buff, ret_val);
+      return Error(ALLOCATION_ERROR);
+    inline_memcpy(*ret, wb.buff, ret_val.value());
   } else {
     *ret = wb.buff;
   }
-  (*ret)[ret_val] = '\0';
+  (*ret)[ret_val.value()] = '\0';
   return ret_val;
 }
 } // namespace printf_core
diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 630de9d9d43dd..564441d3bf51a 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/attributes.h" // For LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) {
   reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock();
 }
 
-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   FILE *f) {
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
                                          size_t nmemb, FILE *f) {
   return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked(
       ptr, size * nmemb);
 }
@@ -47,9 +48,11 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); }
 
 LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }
 
-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   ::FILE *f) {
-  return ::fwrite_unlocked(ptr, size, nmemb, f);
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, ::FILE *f) {
+  // We need to use the system errno in this case, as the system write will set
+  // it and we must propagate it back into our code.
+  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
 }
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
 } // namespace internal
@@ -60,26 +63,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) {
   ::FILE *target_file = reinterpret_cast<::FILE *>(fp);
   // Write new_str to the target file. The logic preventing a zero-length write
   // is in the writer, so we don't check here.
-  size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char),
-                                             new_str.size(), target_file);
-  if (written != new_str.size() || internal::ferror_unlocked(target_file))
+  auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char),
+                                                new_str.size(), target_file);
+  // Propagate the actual system error carried in the FileIOResult.
+  if (write_result.has_error())
+    return -write_result.error;
+
+  // In case a short write occurred, or the error was not set on FileIOResult
+  // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..d95195f6f485f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..2a9b6ea7c5e50 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..bd77cd8864312 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" 
+#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..5d936360c0857 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..f9cf8118534f6 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..71e257f08645b 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..65f242b200f18 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include 
"src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..31668a0323c93 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,13 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..2b136d83234cd 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,7 +36,8 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - return writer->get_chars_written(); + // TODO: Use ErrorOr<size_t> + return static_cast<int>(writer->get_chars_written()); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..d71f1dff11943 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,8 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..1b35a09645939 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,9 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +34,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -78,6 +83,25 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + 
ASSERT_ERRNO_EQ(EBADF); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) +TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + 
ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - 
ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..95507e0885dbf 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using 
LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); +} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..9b5f09db8fd41 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..fd2e0f120e90e 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,8 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +17,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +483,16 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + void charsWrittenOverflow(FunctionT func) { + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. 
+    int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f);
+
+    EXPECT_LT(result, 0);
+    ASSERT_ERRNO_FAILURE();
+  }
 };
 
 #define STRFROM_TEST(InputType, name, func)                                    \
@@ -501,4 +513,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test {
   TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) {                       \
     insufficentBufsize(func);                                                  \
   }                                                                            \
-  TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); }
+  TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); }        \
+  TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) {                         \
+    charsWrittenOverflow(func);                                                \
+  }

From 6d4e75cc931a82108526dafb645f68494eb45973 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Tue, 4 Nov 2025 16:21:37 -0800
Subject: [PATCH 250/313] [MISched][NFC] Rename isUnbufferedGroup to isReservedGroup (#166439)

In both ScheduleDAGInstrs and MachineScheduler, we refer to
`BufferSize = 0` as _reserved_ and to `BufferSize = 1` as _unbuffered_.
This convention stems from the fact that we set
`SUnit::hasReservedResource` to true when any of the SUnit's consumed
resources has a BufferSize equal to zero, and set `SUnit::isUnbuffered`
to true when any of its consumed resources has a BufferSize equal to
one.

However, `SchedBoundary::isUnbufferedGroup` doesn't really follow this
convention: it returns true when the resource in question is a
`ProcResGroup` and its BufferSize equals **zero** rather than one. This
can be really confusing for the reader. This patch renames the function
to `isReservedGroup`, in line with the convention mentioned above. NFC.
---
 llvm/include/llvm/CodeGen/MachineScheduler.h | 2 +-
 llvm/lib/CodeGen/MachineScheduler.cpp        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 6c5c27c9662e4..7b965d400ed08 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1038,7 +1038,7 @@ class SchedBoundary {
   getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
                        unsigned ReleaseAtCycle, unsigned AcquireAtCycle);
 
-  bool isUnbufferedGroup(unsigned PIdx) const {
+  bool isReservedGroup(unsigned PIdx) const {
     return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin &&
            !SchedModel->getProcResource(PIdx)->BufferSize;
   }
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index f18c051142960..73993705c4a7b 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
   for (unsigned i = 0; i < ResourceCount; ++i) {
     ReservedCyclesIndex[i] = NumUnits;
     NumUnits += SchedModel->getProcResource(i)->NumUnits;
-    if (isUnbufferedGroup(i)) {
+    if (isReservedGroup(i)) {
       auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
       for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
            U != UE; ++U)
@@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
   assert(NumberOfInstances > 0 &&
          "Cannot have zero instances of a ProcResource");
 
-  if (isUnbufferedGroup(PIdx)) {
+  if (isReservedGroup(PIdx)) {
     // If any subunits are used by the instruction, report that the
     // subunits of the resource group are available at the first cycle
     // in which the unit is available, effectively removing the group
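[Editorial note] The convention described above boils down to the BufferSize field of the machine model's MCProcResourceDesc; a minimal sketch of the two predicates (helper names hypothetical, semantics as stated in the commit message):

    #include "llvm/MC/MCSchedule.h"

    // BufferSize == 0: "reserved" - the resource is held in-order for the
    // exact cycles the instruction occupies it.
    static bool isReserved(const llvm::MCProcResourceDesc &PRD) {
      return PRD.BufferSize == 0;
    }

    // BufferSize == 1: "unbuffered" - at most one in-flight use; a second
    // user stalls until the resource is released.
    static bool isUnbuffered(const llvm::MCProcResourceDesc &PRD) {
      return PRD.BufferSize == 1;
    }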
From fa5cd27ef0fc0b2c17a58407851e0aa0522d788b Mon Sep 17 00:00:00 2001
From: Jin Huang <jinhuang1102@gmail.com>
Date: Tue, 4 Nov 2025 16:23:34 -0800
Subject: [PATCH 251/313] [profcheck] Add unknown branch weights to expanded LL/SC loop. (#166273)

As a follow-up to PR #165841, this change addresses `prof_md` metadata
loss in AtomicExpandPass when lowering `atomicrmw xchg` to a
Load-Linked/Store-Conditional (LL/SC) loop.

This path is distinct from the LSE path addressed previously:

PR #165841 (and its tests) used `-mtriple=aarch64-linux-gnu`, which
targets a modern **ARMv8.1+** architecture. This architecture supports
**Large System Extensions (LSE)**, allowing `atomicrmw` to be lowered
directly to a more efficient hardware instruction.

This PR (and its tests) uses `-mtriple=aarch64--` or
`-mtriple=armv8-linux-gnueabihf`, which indicates an ARMv8.0-or-lower
architecture that does not support LSE. On these targets, the pass must
fall back to synthesizing a manual LL/SC loop using the `ldaxr`/`stxr`
instruction pair.

Similar to the previous issue, the new conditional branch was failing to
inherit the `prof_md` metadata. This PR attaches branch weights to the
newly created branch within the LL/SC loop, ensuring profile information
is preserved.

Co-authored-by: Jin Huang <jingold@google.com>
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp         | 12 ++++++--
 .../AArch64/expand-atomicrmw-xchg-fp.ll       | 28 ++++++++++++++----
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..6412949948c07 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
   BasicBlock *BB = Builder.GetInsertBlock();
   Function *F = BB->getParent();
 
-  assert(AddrAlign >=
-             F->getDataLayout().getTypeStoreSize(ResultTy) &&
+  assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
          "Expected at least natural alignment at this point.");
 
   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1295,7 +1295,13 @@
       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
   Value *TryAgain = Builder.CreateICmpNE(
       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
-  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  // Atomic RMW expands to a Load-Linked / Store-Conditional loop; because it
+  // is hard to predict precise branch weights, we mark the branch as "unknown"
+  // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded; diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll index 95a52aa0f7f52..b509b2469cfdc 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,8 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS -define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { +define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 { ; CHECK-LABEL: @atomic_swap_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half ; CHECK-NEXT: ret void @@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ret void } -define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { +define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float ; CHECK-NEXT: ret void @@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ret void } -define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { +define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f64( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; 
CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double
 ; CHECK-NEXT:    ret void
@@ -74,3 +74,17 @@
   %t1 = atomicrmw xchg ptr %ptr, double %val acquire
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn }
+;.
+; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" }
+; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+;.

From 3ebed51e997b2a517eecb53a78cc0b6ffcdc00a3 Mon Sep 17 00:00:00 2001
From: Dan Liew <dan@su-root.co.uk>
Date: Tue, 4 Nov 2025 16:25:33 -0800
Subject: [PATCH 252/313] [Clang][LLDB] Refactor trap reason demangling out of LLDB and into Clang (#165996)

This patch refactors the trap reason demangling logic in
`lldb_private::VerboseTrapFrameRecognizer::RecognizeFrame` into a new
public function `clang::CodeGen::DemangleTrapReasonInDebugInfo`.

There are two reasons for doing this:

1. In a future patch the logic for demangling needs to be used somewhere
   else in LLDB, and thus the logic needs refactoring to avoid
   duplicating code.
2. The logic for demangling shouldn't really be in LLDB anyway because
   it's a Clang implementation detail, and thus the logic really belongs
   inside Clang, not LLDB.

Unit tests have been added for the new function that demonstrate how to
use the new API.

The function names recognized by VerboseTrapFrameRecognizer are identical
to before. However, this patch isn't NFC because:

* The `lldbTarget` library now links against `clangCodeGen`, which it
  didn't previously.
* The LLDB logging output is a little different now. The previous code
  tried to log failures for an invalid regex pattern and for the
  `Regex::match` API not returning the correct number of matches. These
  failure conditions are unreachable via unit testing, so they have been
  made assertion failures inside the `DemangleTrapReasonInDebugInfo`
  implementation instead of trying to log them in LLDB.

rdar://163230807
---
 clang/include/clang/CodeGen/ModuleBuilder.h   | 17 +++++
 clang/lib/CodeGen/ModuleBuilder.cpp           | 29 ++++++++
 clang/unittests/CodeGen/CMakeLists.txt        |  1 +
 .../CodeGen/DemangleTrapReasonInDebugInfo.cpp | 67 +++++++++++++++++++
 .../LanguageRuntime/CPlusPlus/CMakeLists.txt  |  2 +
 .../CPlusPlus/VerboseTrapFrameRecognizer.cpp  | 31 ++-------
 6 files changed, 122 insertions(+), 25 deletions(-)
 create mode 100644 clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp

diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h
index f1b8229edd362..4298ba06c472e 100644
--- a/clang/include/clang/CodeGen/ModuleBuilder.h
+++ b/clang/include/clang/CodeGen/ModuleBuilder.h
@@ -120,6 +120,23 @@
 CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::LLVMContext &C,
                                  CoverageSourceInfo *CoverageInfo = nullptr);
 
+namespace CodeGen {
+/// Demangle the artificial function name (\p FuncName) used to encode trap
+/// reasons in debug info (e.g. for __builtin_verbose_trap). See
+/// `CGDebugInfo::CreateTrapFailureMessageFor`.
+///
+/// \param FuncName - The function name to demangle.
+///
+/// \return A std::optional. If demangling succeeds the optional will contain
+/// a pair of StringRefs where the first field is the trap category and the
+/// second is the trap message. These can both be empty. If demangling fails
+/// the optional will not contain a value. Note the returned StringRefs, if
+/// non-empty, point into the underlying storage of \p FuncName and thus have
+/// the same lifetime.
+std::optional<std::pair<StringRef, StringRef>>
+DemangleTrapReasonInDebugInfo(StringRef FuncName);
+} // namespace CodeGen
+
 } // end namespace clang
 
 #endif
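[Editorial note] A minimal caller-side sketch of the new API, mirroring the unit tests and the LLDB use later in this patch (the mangled name shown is hypothetical but follows the `<prefix>$<category>$<message>` scheme):

    #include "clang/CodeGen/ModuleBuilder.h"
    #include "llvm/Support/raw_ostream.h"

    static void printTrapReason(llvm::StringRef FuncName) {
      // e.g. FuncName == "__clang_trap_msg$Undefined Behavior$null deref"
      if (auto TrapReason =
              clang::CodeGen::DemangleTrapReasonInDebugInfo(FuncName)) {
        auto [Category, Message] = *TrapReason;
        // Category and Message alias FuncName's storage; keep it alive.
        llvm::errs() << Category << ": " << Message << '\n';
      }
    }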
+/// +/// \param FuncName - The function name to demangle. +/// +/// \return A std::optional. If demangling succeeds the optional will contain +/// a pair of StringRefs where the first field is the trap category and the +/// second is the trap message. These can both be empty. If demangling fails the +/// optional will not contain a value. Note the returned StringRefs if non-empty +/// point into the underlying storage for \param FuncName and thus have the same +/// lifetime. +std::optional<std::pair<StringRef, StringRef>> +DemangleTrapReasonInDebugInfo(StringRef FuncName); +} // namespace CodeGen + } // end namespace clang #endif diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index 96f3f6221e20f..8ec8aef311656 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VirtualFileSystem.h" #include <memory> @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName, HeaderSearchOpts, PreprocessorOpts, CGO, C, CoverageInfo); } + +namespace clang { +namespace CodeGen { +std::optional<std::pair<StringRef, StringRef>> +DemangleTrapReasonInDebugInfo(StringRef FuncName) { + static auto TrapRegex = + llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); + llvm::SmallVector<llvm::StringRef, 3> Matches; + std::string *ErrorPtr = nullptr; +#ifndef NDEBUG + std::string Error; + ErrorPtr = &Error; +#endif + if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) { + assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern"); + return {}; + } + + if (Matches.size() != 3) { + assert(0 && "Expected 3 matches from Regex::match"); + return {}; + } + + // Returns { Trap Category, Trap Message } + return std::make_pair(Matches[1], Matches[2]); +} +} // namespace CodeGen +} // namespace clang diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index f5bcecb0b08a3..d4efb2230a054 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp + DemangleTrapReasonInDebugInfo.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp CLANG_LIBS diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp new file mode 100644 index 0000000000000..17bfe17c31d65 --- /dev/null +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp @@ -0,0 +1,67 @@ +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/CodeGen/ModuleBuilder.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" + +using namespace clang::CodeGen; + +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory, + const char *ExpectedMessage) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(MaybeTrapReason.has_value()); + auto [Category, Message] = MaybeTrapReason.value(); + ASSERT_STREQ(Category.str().c_str(), ExpectedCategory); + ASSERT_STREQ(Message.str().c_str(), ExpectedMessage); +} + +void CheckInvalidCommon(llvm::StringRef FuncName) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(!MaybeTrapReason.has_value()); +} + +TEST(DemangleTrapReasonInDebugInfo, Valid) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$trap message"; + CheckValidCommon(FuncName, "trap category", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$$trap message"; + CheckValidCommon(FuncName, "", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$"; + CheckValidCommon(FuncName, "trap category", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) { + // `__builtin_verbose_trap` actually allows this + // currently. However, we should probably disallow this in Sema because having + // an empty category and message completely defeats the point of using the + // builtin (#165981). + std::string FuncName(ClangTrapPrefix); + FuncName += "$$"; + CheckValidCommon(FuncName, "", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) { + std::string FuncName(ClangTrapPrefix); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, Invalid) { + std::string FuncName("foo"); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index a27bceffe2e3a..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -6,6 +6,8 @@ add_lldb_library(lldbPluginCPPRuntime lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 730aba5b42a3e..2b6bf2cd470e6 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 
3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); From d54793113798f74424f21b94fe74bd675a11e801 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 16:39:12 -0800 Subject: [PATCH 253/313] [SLU][profcheck] create likely branch weights for guard->branch (#164271) The `llvm.experimental.guard` intrinsic is a `call`, so its metadata - if present - would be one value (as per `Verifier::visitProfMetadata`). That wouldn't be a correct `branch_weights` metadata. Likely, `GI->getMetadata(LLVMContext::MD_prof)` was always `nullptr`. We can bias away from deopt instead. Issue #147390 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 ++++++--- .../Transforms/SimpleLoopUnswitch/guards.ll | 26 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 239526e85e1fd..86b2090081ed0 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. 
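A minimal sketch of the MDBuilder helper the hunk above leans on; this is editorial, not part of the diff, and emitDeoptBiasedBranch, Cond, ThenBB, and ElseBB are hypothetical names:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"

// Emit a conditional branch whose first (true) successor is biased as
// almost never taken. createUnlikelyBranchWeights() yields
// !{!"branch_weights", i32 1, i32 1048575}.
llvm::BranchInst *emitDeoptBiasedBranch(llvm::IRBuilder<> &Builder,
                                        llvm::Value *Cond,
                                        llvm::BasicBlock *ThenBB,
                                        llvm::BasicBlock *ElseBB) {
  llvm::MDBuilder MDB(Builder.getContext());
  llvm::MDNode *Weights = MDB.createUnlikelyBranchWeights();
  return Builder.CreateCondBr(Cond, ThenBB, ElseBB, Weights);
}

The patch hands the same metadata to SplitBlockAndInsertIfThen and then flips the successors (per the comment above), which is why the test below checks for !{!"branch_weights", i32 1048575, i32 1}.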
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 706b49df14749..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,15 +1,15 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) -define void @test_simple_case(i1 %cond, i32 %N) { +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { ; CHECK-LABEL: define void @test_simple_case( -; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US:.*]] ; CHECK: [[LOOP_US]]: @@ -50,9 +50,9 @@ define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-LABEL: define void @test_two_guards( ; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: -; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US_US:.*]] ; CHECK: [[LOOP_US_US]]: @@ -108,7 +108,7 @@ define void @test_conditional_guards(i1 %cond, i32 %N) { ; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] -; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US:.*]] ; CHECK: [[LOOP_US]]: @@ -171,7 +171,7 @@ define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { ; CHECK-LABEL: define void @test_nested_loop( ; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT]]: ; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] ; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: @@ -243,7 +243,7 @@ define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-LABEL: define void @test_sibling_loops( ; CHECK-SAME: i1 [[COND1:%.*]], i1 
[[COND2:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP1_US:.*]] ; CHECK: [[LOOP1_US]]: @@ -263,7 +263,7 @@ define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable ; CHECK: [[BETWEEN]]: -; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[BETWEEN_SPLIT_US2]]: ; CHECK-NEXT: br label %[[LOOP2_US:.*]] ; CHECK: [[LOOP2_US]]: @@ -343,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. From 02f5a1aa07f1654a28d075dec6fc0ac5da0801b6 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 16:39:26 -0800 Subject: [PATCH 254/313] [ADT] Introduce Casting function objects (#165803) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding casting function objects as a convenience for expressing e.g. `auto AsDoubles = map_range(RangeOfInts, StaticCastTo<double>)` --- llvm/include/llvm/Support/Casting.h | 50 +++++++++++++++++++++++++++++ llvm/unittests/Support/Casting.cpp | 41 +++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..a6435a2562a2b 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above.
+template <typename From> +inline constexpr detail::StaticCastFunc<From> StaticCastTo{}; + +template <typename From> inline constexpr detail::CastFunc<From> CastTo{}; + +template <typename From> +inline constexpr detail::CastIfPresentFunc<From> CastIfPresentTo{}; + +template <typename From> +inline constexpr detail::DynCastIfPresentFunc<From> DynCastIfPresentTo{}; + +template <typename From> inline constexpr detail::DynCastFunc<From> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp index 790675083614b..0df8b9fcab452 100644 --- a/llvm/unittests/Support/Casting.cpp +++ b/llvm/unittests/Support/Casting.cpp @@ -561,6 +561,47 @@ TEST(CastingTest, assertion_check_unique_ptr) { << "Invalid cast of const ref did not cause an abort()"; } +TEST(Casting, StaticCastPredicate) { + uint32_t Value = 1; + + static_assert( + std::is_same_v<decltype(StaticCastTo<uint64_t>(Value)), uint64_t>); +} + +TEST(Casting, LLVMRTTIPredicates) { + struct Base { + enum Kind { BK_Base, BK_Derived }; + const Kind K; + Base(Kind K = BK_Base) : K(K) {} + Kind getKind() const { return K; } + virtual ~Base() = default; + }; + + struct Derived : Base { + Derived() : Base(BK_Derived) {} + static bool classof(const Base *B) { return B->getKind() == BK_Derived; } + bool Field = false; + }; + + Base B; + Derived D; + Base *BD = &D; + Base *Null = nullptr; + + // Pointers. + EXPECT_EQ(DynCastTo<Derived>(BD), &D); + EXPECT_EQ(CastTo<Derived>(BD), &D); + EXPECT_EQ(DynCastTo<Derived>(&B), nullptr); + EXPECT_EQ(CastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(CastIfPresentTo<Derived>(Null), nullptr); + EXPECT_EQ(DynCastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(DynCastIfPresentTo<Derived>(Null), nullptr); + + Base &R = D; + CastTo<Derived>(R).Field = true; + EXPECT_TRUE(D.Field); +} + } // end namespace assertion_checks #endif } // end namespace From 4209e41c4c1ba7c9595360c4c0ab3e85d13000f2 Mon Sep 17 00:00:00 2001 From: Michael Jones <michaelrj@google.com> Date: Tue, 4 Nov 2025 16:42:11 -0800 Subject: [PATCH 255/313] [libc] Fix printf long double bugs (#166474) Found in testing against abseil. Two bugs were found: 1) SHIFT_AMOUNT in float_converter<long double> would sometimes be negative, causing an underflow when it was passed as the left-shift amount to BigInt. 2) is_lowest_block had an off-by-one because it was adding 1 to the block index. Both are fixed and there are new tests to catch regressions. --- libc/src/__support/float_to_string.h | 8 ++++++-- libc/test/src/stdio/sprintf_test.cpp | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point.
const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); From 2b60b6d9639c240ebafc8b517425453e50e14a7b Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 16:46:09 -0800 Subject: [PATCH 256/313] [llvm][mustache] Avoid extra allocations in parseSection (#159199) We don't need to have extra allocations when concatenating raw bodies. --- llvm/lib/Support/Mustache.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 9eb1ec2b8425c..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -599,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } From b296e8fdf847f0041455e495b13ffb7aa6592e56 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Wed, 5 Nov 2025 00:50:14 +0000 Subject: [PATCH 257/313] [ProfCheck] Disable X86 AMX Test Case 4776451693f4a6bd18e50106edb4b3cfa766484f broke this because it started running an existing pass using the NewPM, which caused ProfCheck to catch existing issues. Disable it for now because we have not started looking at anything in the Codegen pipeline. This pass is also only enabled at O0 or if a function has optnone, so not super critical. 
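An aside on the Mustache change above (PATCH 256): the pattern it adopts, one pass to size the output followed by a reserve() before the appends, is a general way to avoid repeated reallocation when concatenating many fragments. A minimal generic sketch of the same idiom; concatParts and Parts are hypothetical names:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include <cstddef>

llvm::SmallString<128> concatParts(llvm::ArrayRef<llvm::StringRef> Parts) {
  // First pass: compute the total size so the buffer grows at most once.
  std::size_t TotalSize = 0;
  for (llvm::StringRef Part : Parts)
    TotalSize += Part.size();

  llvm::SmallString<128> Result;
  Result.reserve(TotalSize);
  // Second pass: append without intermediate reallocations.
  for (llvm::StringRef Part : Parts)
    Result += Part;
  return Result;
}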
--- llvm/utils/profcheck-xfail.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index d7af3a7ecbdee..53757c73fb8a6 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -75,6 +75,7 @@ CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll +CodeGen/X86/AMX/amx-low-intrinsics.ll CodeGen/X86/masked_gather_scatter.ll CodeGen/X86/nocfivalue.ll DebugInfo/AArch64/ir-outliner.ll From 15b19c732172d2d3cfbc108a268cbba03eed734f Mon Sep 17 00:00:00 2001 From: Michael Jones <michaelrj@google.com> Date: Tue, 4 Nov 2025 16:56:00 -0800 Subject: [PATCH 258/313] [libc] Fix fprintf_test assuming specific errnos. (#166479) The patch #166382 fixed most of these, but missed the fprintf_test ones. --- libc/test/src/stdio/fprintf_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 1b35a09645939..2cea7f554ce38 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -83,7 +83,7 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } @@ -100,7 +100,7 @@ TEST(LlvmLibcFPrintfTest, NullPtrCheck) { int ret = LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); EXPECT_LT(ret, 0); - ASSERT_ERRNO_EQ(EINVAL); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } From 1d0aa6c2ad62f0bb6cd2ddbde07251c5702d29f4 Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aaupov@fb.com> Date: Tue, 4 Nov 2025 17:01:25 -0800 Subject: [PATCH 259/313] [BOLT] Fix impute-fall-throughs (#166305) BOLT expects pre-aggregated profile entries to be unique, which holds for externally aggregated traces (or branches+fall-through ranges). Therefore, BOLT doesn't merge duplicate entries for faster processing. However, such traces are not expressly prohibited and could come from concatenated pre-aggregated profiles or otherwise. Relax the assumption about no duplicate (branch-only) traces in fall-through imputing. Test Plan: updated callcont-fallthru.s --- bolt/lib/Profile/DataAggregator.cpp | 14 +++++++------- bolt/test/X86/callcont-fallthru.s | 9 +++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 4e062038a3e4c..8554683bc3cf8 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -564,13 +564,18 @@ void DataAggregator::imputeFallThroughs() { // Skip fall-throughs in external code. if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ?
AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +583,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. - if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputing test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check --impute-trace-fall-throughs accepting duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 From 397415792929e09e07a65ffadc265966cc04db32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 17:21:37 -0800 Subject: [PATCH 260/313] AMDGPU: Pre-commit a test (#166414) --- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..d2ec1fcbac84f 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -34,3 +34,16 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32, implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... From 1458d313a1cac6d2b255378e2b816092c6cbc4c7 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 17:23:55 -0800 Subject: [PATCH 261/313] [SLU][profcheck] Propagate profile for branches on injected conditions. 
(#164476) This patch addresses the profile of 2 branches: - one that compares the 2 limits, for which we have no information (the C1, C2, see https://reviews.llvm.org/D136233) - one that is conditioned on a condition for which we have a profile, so we reuse it Issue #147390 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +- .../inject-invariant-conditions.ll | 142 +++++++++--------- 2 files changed, 79 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 86b2090081ed0..0577ddbd2353c 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3203,10 +3203,15 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled( + *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr 
[[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr 
noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: 
backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr 
noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, 
ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label 
[[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. 
-; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. From 4b367e0b85aa2d56f58c3e85f9a330019254f6a9 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen <github@kleisauke.nl> Date: Wed, 5 Nov 2025 02:35:15 +0100 Subject: [PATCH 262/313] [WebAssembly] Use IRBuilder in FixFunctionBitcasts (NFC) (#164268) Simplifies the code a bit. --- .../WebAssemblyFixFunctionBitcasts.cpp | 27 +++++++------------ .../unsupported-function-bitcasts.ll | 12 ++++----- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. 
SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); - Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); - Type *RtnType = Ty->getReturnType(); // Determine what value to return. if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: 
call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function From cfca2297822de682daf53124d3ca5a55fb3bfee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 17:57:54 -0800 Subject: [PATCH 263/313] AMDGPU: Add and clarify reserved address spaces (#166486) Address spaces 10 and 11 are reserved for future use in the sense that we plan to upstream their use. Address space 12 is used by LLPC. It is used in a workaround for an issue with SMEM accesses to PRT buffers that is specific to the LLPC ecosystem and makes no sense to upstream. --- llvm/docs/AMDGPUUsage.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..30b22a4a6d607 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ From d6fdfe0a27d3d76b8f2adff359cb099573a30f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 18:18:39 -0800 Subject: [PATCH 264/313] CodeGen: Record tied virtual register operands in finalizeBundle (#166209) This is in preparation for a future AMDGPU change where we are going to create bundles before register allocation and want to rely on the TwoAddressInstructionPass handling those bundles correctly.
v2:
- simplify the virtual register check and the test
---
 llvm/lib/CodeGen/MachineInstrBundle.cpp     | 22 ++++++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/finalizebundle.mir |  2 +-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index da29ffc9d2fed..88d81993fbe55 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -136,6 +136,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
   SmallSetVector<Register, 8> ExternUses;
   SmallSet<Register, 8> KilledUseSet;
   SmallSet<Register, 8> UndefUseSet;
+  SmallVector<std::pair<Register, Register>> TiedOperands;
   for (auto MII = FirstMI; MII != LastMI; ++MII) {
     // Debug instructions have no effects to track.
     if (MII->isDebugInstr())
@@ -161,6 +162,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
         // External def is now killed.
         KilledUseSet.insert(Reg);
       }
+      if (MO.isTied() && Reg.isVirtual()) {
+        // Record tied operand constraints that involve virtual registers so
+        // that bundles that are formed pre-register allocation reflect the
+        // relevant constraints.
+        unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo());
+        MachineOperand &TiedMO = MII->getOperand(TiedIdx);
+        Register DefReg = TiedMO.getReg();
+        TiedOperands.emplace_back(DefReg, Reg);
+      }
     }
   }
@@ -203,7 +213,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
     bool isKill = KilledUseSet.contains(Reg);
     bool isUndef = UndefUseSet.contains(Reg);
     MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
-                   getImplRegState(true));
+                       getImplRegState(true));
+  }
+
+  for (auto [DefReg, UseReg] : TiedOperands) {
+    unsigned DefIdx =
+        std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg));
+    unsigned UseIdx =
+        std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg));
+    assert(DefIdx < LocalDefs.size());
+    assert(UseIdx < ExternUses.size());
+    MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
+  }
 }

diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index d2ec1fcbac84f..279f4298e6418 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -40,7 +40,7 @@ name: test_tied
 body: |
   bb.0:
     ; CHECK-LABEL: name: test_tied
-    ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32, implicit $mode, implicit $exec {
+    ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec {
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
     ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec
     ; CHECK-NEXT: }

From d998f92a002bbdd78156716bad60523d7ddf1233 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay@kanhere.net>
Date: Tue, 4 Nov 2025 18:39:31 -0800
Subject: [PATCH 265/313] [CodeGen] MachineVerifier to check early-clobber
 constraint (#151421)

Currently, MachineVerifier does not verify the early-clobber operand
constraint. The only other machine operand constraint, TiedTo, is
already verified.
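For background, an early-clobber def is written before the instruction has
finished reading its uses, so the register allocator must not assign it the
same register as any use. The same constraint appears at the source level as
the '&' modifier in GNU inline asm; the sketch below is purely illustrative
("madd" is a placeholder mnemonic here, not a claim about any particular
ISA):

  // '&' in "=&r" marks an early-clobber output: 'out' must not share a
  // register with 'a', 'b', or 'c', because the asm template may write
  // 'out' before it has read all of its inputs.
  static inline unsigned long madd(unsigned long a, unsigned long b,
                                   unsigned long c) {
    unsigned long out;
    asm("madd %0, %1, %2, %3"
        : "=&r"(out)
        : "r"(a), "r"(b), "r"(c));
    return out;
  }

The verifier change below enforces the MI-level equivalent: any operand
whose MCInstrDesc carries the EARLY_CLOBBER constraint must be a register
operand with the earlyClobber flag set.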
--- llvm/lib/CodeGen/MachineVerifier.cpp | 8 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 15 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 10 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 16 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1048 +++--- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 2564 +++++++------- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1050 +++--- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 3064 ++++++++--------- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 842 ++--- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 123 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 266 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 518 +-- .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 74 +- llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 466 ++- 14 files changed, 5068 insertions(+), 4996 deletions(-) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..fdf10480b6e05 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0c977416f1793..957d7164b686e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..2351c969d5e49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..7f10ee4c17450 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 
%num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] ; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: @@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] ; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] ; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] 
%result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] @@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 
v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mov_b32_e32 v2, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo @@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: 
v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX7-NEXT: v_mov_b32_e32 v21, v22 +; GFX7-NEXT: v_mov_b32_e32 v22, v23 +; GFX7-NEXT: v_mov_b32_e32 v23, v18 +; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX7-NEXT: v_mov_b32_e32 v20, v23 +; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v12, v22 +; GFX7-NEXT: v_mul_lo_u32 
v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v7, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: 
v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX8-NEXT: v_mov_b32_e32 v21, v22 +; GFX8-NEXT: v_mov_b32_e32 v22, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, v18 +; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX8-NEXT: v_mov_b32_e32 v20, v23 +; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v12, v22 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 
v[11:12], s[14:15], v5, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, 
v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX9-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX9-NEXT: 
v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 
v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 +; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v18, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX10-NEXT: v_mov_b32_e32 v19, v24 +; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX10-NEXT: v_mov_b32_e32 v18, v21 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, 
s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, v20 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 ; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 ; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 +; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, 
v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v8, v24 +; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] +; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v12, null, 0, v12, s4 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX12-NEXT: 
v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v19, v24 +; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX12-NEXT: v_mov_b32_e32 v18, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, 
v[21:22] ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 ; 
GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], 
s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 
v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 
v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..01c601f0646b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[7:8], 
s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; 
CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 
1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: 
v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: 
v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, 
vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 +; GISEL-NEXT: 
v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 
v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; 
GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: 
v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: 
v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, 
v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 
v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: 
v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 ; GISEL-NEXT: 
s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] +; 
GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: 
v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 
v9, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 @@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v16, 
vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, 
v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], 
v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; 
CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: 
v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: 
v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 @@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: 
v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, 
v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, 
v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: 
v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, 
vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: 
v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 
0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: 
v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; 
CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, 
vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v5, 
v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 ; GISEL-NEXT: v_trunc_f32_e32 v8, v6 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GISEL-NEXT: v_add_i32_e32 
v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, 
v0, v6 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..f4489c2239fda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; 
GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; 
GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; 
GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; 
GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: 
global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, 
v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s18, s6 ; GFX8-NEXT: s_addc_u32 s1, s19, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s10 @@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, 
vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 
v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; 
GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: 
v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; 
GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..6f42239cd191d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v6, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 
0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, 
v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; 
CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, 
s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 
v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 
; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -457,148 +457,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v9 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] +; 
GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; 
GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, 
v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: 
v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; 
CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, 
v5 +; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +977,82 @@ define i64 
@v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, 
vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: 
v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; 
GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, 
vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 
v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 
v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 
0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, 
v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, 
v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; 
CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; 
CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; 
GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], 
s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: 
v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 
v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; 
CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: 
v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, 
vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: 
v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: 
v_mac_f32_e32 v4, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc 
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 ; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, 
v14, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, 
v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] 
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; 
CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; 
implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; 
CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: 
v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: 
v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: 
v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 
v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..23ef596c021c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: 
v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 
v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; 
GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; 
GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: 
v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 
v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 
v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v4, v3 ; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; 
GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 +; GFX8-NEXT: 
v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] @@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, 
s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; 
GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s18, 
v5 ; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..26f77898faf60 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] ; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v25 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, 
vcc, v17, v22, vcc +; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v18, v26 +; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] +; 
GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e0421575c3174..460f1211d1386 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 
v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 
v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -999,12 +1001,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 
0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; 
GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, 
s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..f705a2ffc4f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, 
v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; 
GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, 
vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: 
v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 
0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; 
GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0f00192..d82d6bcb437cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 
v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 -; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 -; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5 -; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7 -; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 -; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 +; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 +; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5 +; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7 +; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9 +; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11 +; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13 +; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9 -; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41 +; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43 +; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45 +; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47 +; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49 +; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51 +; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53 +; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55 ; GISEL12-NEXT: .LBB5_2: ; %tail ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GISEL10-NEXT: v_mov_b32_e32 v24, v0 -; GISEL10-NEXT: v_mov_b32_e32 v25, v1 -; GISEL10-NEXT: v_mov_b32_e32 v26, v2 -; GISEL10-NEXT: v_mov_b32_e32 v27, v3 -; GISEL10-NEXT: v_mov_b32_e32 v28, v4 -; GISEL10-NEXT: v_mov_b32_e32 v29, v5 -; GISEL10-NEXT: v_mov_b32_e32 v30, v6 -; GISEL10-NEXT: v_mov_b32_e32 v31, v7 -; GISEL10-NEXT: v_mov_b32_e32 v32, v8 -; GISEL10-NEXT: v_mov_b32_e32 v33, v9 -; GISEL10-NEXT: v_mov_b32_e32 v34, v10 -; GISEL10-NEXT: v_mov_b32_e32 v35, v11 -; GISEL10-NEXT: v_mov_b32_e32 v36, v12 -; GISEL10-NEXT: v_mov_b32_e32 v37, v13 -; GISEL10-NEXT: v_mov_b32_e32 v38, v14 -; GISEL10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL10-NEXT: v_mov_b32_e32 v40, v0 +; GISEL10-NEXT: v_mov_b32_e32 v41, v1 +; GISEL10-NEXT: v_mov_b32_e32 v42, v2 +; GISEL10-NEXT: v_mov_b32_e32 v43, v3 +; GISEL10-NEXT: v_mov_b32_e32 v44, v4 +; GISEL10-NEXT: v_mov_b32_e32 v45, v5 +; GISEL10-NEXT: v_mov_b32_e32 v46, v6 +; GISEL10-NEXT: v_mov_b32_e32 v47, v7 +; GISEL10-NEXT: v_mov_b32_e32 v48, v8 +; GISEL10-NEXT: v_mov_b32_e32 v49, v9 +; GISEL10-NEXT: v_mov_b32_e32 v50, v10 +; GISEL10-NEXT: v_mov_b32_e32 v51, v11 +; GISEL10-NEXT: v_mov_b32_e32 v52, v12 +; GISEL10-NEXT: v_mov_b32_e32 v53, v13 +; GISEL10-NEXT: 
v_mov_b32_e32 v54, v14 +; GISEL10-NEXT: v_mov_b32_e32 v55, v15 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9 -; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL10-NEXT: v_mov_b32_e32 v24, v40 +; GISEL10-NEXT: v_mov_b32_e32 v25, v41 +; GISEL10-NEXT: v_mov_b32_e32 v26, v42 +; GISEL10-NEXT: v_mov_b32_e32 v27, v43 +; GISEL10-NEXT: v_mov_b32_e32 v28, v44 +; GISEL10-NEXT: v_mov_b32_e32 v29, v45 +; GISEL10-NEXT: v_mov_b32_e32 v30, v46 +; GISEL10-NEXT: v_mov_b32_e32 v31, v47 +; GISEL10-NEXT: v_mov_b32_e32 v32, v48 +; GISEL10-NEXT: v_mov_b32_e32 v33, v49 +; GISEL10-NEXT: v_mov_b32_e32 v34, v50 +; GISEL10-NEXT: v_mov_b32_e32 v35, v51 +; GISEL10-NEXT: v_mov_b32_e32 v36, v52 +; GISEL10-NEXT: v_mov_b32_e32 v37, v53 +; GISEL10-NEXT: v_mov_b32_e32 v38, v54 +; GISEL10-NEXT: v_mov_b32_e32 v39, v55 ; GISEL10-NEXT: .LBB5_2: ; %tail ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..1b4ed67eb6eea 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ 
-2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; 
GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], 
s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 
v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 
v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v9, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, 
v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; 
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], 
v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: From 5c8bcf7dc7461097a0d2b1a43fc46ab37363adc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Tue, 4 Nov 2025 16:44:58 -1000 Subject: [PATCH 266/313] [flang][cuda][NFC] Move CUDA intrinsics lowering to a separate file (#166461) Just move all CUDA related intrinsics lowering to a separate file to avoid clobbering the main Fortran intrinsic file. 
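To illustrate the intended split, here is a minimal, hypothetical sketch of
how a caller could consult the new CUDA table before falling back to the
generic Fortran handlers. Only findCUDAIntrinsicHandler and the
CUDAIntrinsicLibrary constructor come from the interfaces added in
CUDAIntrinsicCall.h; the function name and the lookup-then-fallback ordering
below are illustrative assumptions, not the wiring done in this patch.

  #include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"

  // Sketch only: returns true when `name` is handled by the CUDA table.
  static bool tryLowerCUDAIntrinsic(fir::FirOpBuilder &builder,
                                    mlir::Location loc,
                                    llvm::StringRef name) {
    // findCUDAIntrinsicHandler scans the cudaHandlers table in
    // CUDAIntrinsicCall.cpp and returns nullptr for non-CUDA names.
    if (const fir::IntrinsicHandler *handler =
            fir::findCUDAIntrinsicHandler(name)) {
      // CUDAIntrinsicLibrary derives from IntrinsicLibrary, so the generator
      // stored in the handler entry can be invoked against it.
      fir::CUDAIntrinsicLibrary cudaLib(builder, loc);
      (void)cudaLib; // generator invocation elided in this sketch
      return true;
    }
    return false; // defer to the generic IntrinsicLibrary tables
  }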
--- .../Optimizer/Builder/CUDAIntrinsicCall.h | 95 + .../flang/Optimizer/Builder/IntrinsicCall.h | 64 - flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 1588 +++++++++++++++++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 1346 +------------- 5 files changed, 1691 insertions(+), 1403 deletions(-) create mode 100644 flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h create mode 100644 flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..d735ce95a83dc --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,95 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value 
genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index b64419f5ae6da..01d27fd5fc399 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,25 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicAddR2(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - template <int extent> - fir::ExtendedValue genAtomicAddVector(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value 
genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -213,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -239,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -281,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -373,8 +344,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -397,8 +366,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -453,56 +420,25 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, 
llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. 
There may be an additional KIND arguments that
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 1f95259a857da..37c9c2d703c76 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FIRBuilder
   BoxValue.cpp
   Character.cpp
   Complex.cpp
+  CUDAIntrinsicCall.cpp
   CUFCommon.cpp
   DoLoopHelper.cpp
   FIRBuilder.cpp
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
new file mode 100644
index 0000000000000..4e276a72897fe
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -0,0 +1,1588 @@
+//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
+// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
+// CUDA-specific intrinsic handlers.
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceBlock), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceSystem), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::is_sorted when C++20 is the default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ?
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
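+  // Extract each lane of the vector produced by the atomic operation and
+  // store it into the corresponding element of the result array.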
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, + barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +
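// Feed the PTX predicate back through scf.yield; the before-region compares
+  // it against zero to decide whether the wait loop runs again.
+ 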
mlir::scf::YieldOp::create(builder, loc, ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; + mlir::Value mask = fir::getBase(args[0]); + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> argsList{mask}; + fir::CallOp::create(builder, 
loc, funcOp, argsList); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType =
fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res;
+} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = (threadIdx.x & 31) + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_BLOCK +void CUDAIntrinsicLibrary::genThreadFenceBlock( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_SYSTEM +void CUDAIntrinsicLibrary::genThreadFenceSystem( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier =
convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// 
TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void 
CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 3156c8cb4332c..3eb60448fae38 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. 
-static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - 
{"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,103 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicadd_r4x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicadd_r4x4", - &I::genAtomicAddVector<4>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddr2", - &I::genAtomicAddR2, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddvector_r2x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddvector_r4x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", 
&I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -430,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -531,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -589,10 +347,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -760,38 +514,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - 
&I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -1017,20 +739,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1041,115 +749,13 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, {"this_image", &I::genThisImage, {{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - 
{"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", - &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2241,6 +1847,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. if (!resultType) return std::nullopt; @@ -3127,244 +2736,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -fir::ExtendedValue -IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - - mlir::Value a = fir::getBase(args[0]); - - if (mlir::isa<fir::BaseBoxType>(a.getType())) { - a = fir::BoxAddrOp::create(builder, loc, a); - } - - auto loc = builder.getUnknownLoc(); - auto f16Ty = builder.getF16Type(); - auto i32Ty = builder.getI32Type(); - auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); - mlir::Type idxTy = builder.getIndexType(); - auto f16RefTy = fir::ReferenceType::get(f16Ty); - auto zero = builder.createIntegerConstant(loc, idxTy, 0); - auto one = builder.createIntegerConstant(loc, idxTy, 1); - auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, - fir::getBase(args[1]), zero); - auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, - fir::getBase(args[1]), one); - auto v1 = fir::LoadOp::create(builder, loc, v1Coord); - auto v2 = fir::LoadOp::create(builder, loc, v2Coord); - mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); - mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( - builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); - auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - auto i32VecTy = mlir::VectorType::get({1}, i32Ty); - mlir::Value vecI32 = - mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); - return mlir::vector::ExtractOp::create(builder, loc, vecI32, - mlir::ArrayRef<int64_t>{0}); -} - -template <int extent> -fir::ExtendedValue -IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create( - builder, loc, fir::SequenceType::get({extent}, resultType)); - mlir::Value a = fir::getBase(args[0]); - if (mlir::isa<fir::BaseBoxType>(a.getType())) { - a = fir::BoxAddrOp::create(builder, loc, a); - } - auto vecTy = mlir::VectorType::get({extent}, resultType); - auto refTy = fir::ReferenceType::get(resultType); - mlir::Type i32Ty = builder.getI32Type(); - mlir::Type idxTy = builder.getIndexType(); - - // Extract the values from the array. - llvm::SmallVector<mlir::Value> values; - for (unsigned i = 0; i < extent; ++i) { - mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); - mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), pos); - mlir::Value value = fir::LoadOp::create(builder, loc, coord); - values.push_back(value); - } - // Pack extracted values into a vector to call the atomic add. - mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); - for (unsigned i = 0; i < extent; ++i) { - mlir::Value insert = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, values[i], - builder.createIntegerConstant(loc, i32Ty, i)); - undef = insert; - } - // Atomic operation with a vector of values. - mlir::Value add = - genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); - // Store results in the result array. 
- for (unsigned i = 0; i < extent; ++i) { - mlir::Value r = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); - mlir::Value c = fir::CoordinateOp::create( - builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); - fir::StoreOp::create(builder, loc, r, c); - } - mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); - return fir::ArrayBoxValue(res, {ext}); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3416,118 +2787,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIBVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, - {barrier, args[1]}, {}, - "mbarrier.arrive.expect_tx.release." 
- "cta.shared::cta.b64 %0, [%1], %2;", - {}) - .getResult(0); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - "{\n" - " .reg .pred p;\n" - " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" - " selp.b32 %0, 1, 0, p;\n" - "}", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - "{\n" - " .reg .pred p;\n" - " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" - " selp.b32 %0, 1, 0, p;\n" - "}", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4261,30 +3520,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = 
fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4617,17 +3852,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -7138,67 +6362,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7816,14 +6979,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8798,92 +7953,6 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - 
mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // THIS_IMAGE fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, @@ -8899,99 +7968,6 @@ IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = 
builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9213,65 +8189,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); 
- mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9403,38 +8320,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value 
IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9443,226 +8328,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - 
builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group;", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group;", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - 
mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R2 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR2(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R4 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR4(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R8 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR8(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_WAIT_GROUP (CUDA)
-void IntrinsicLibrary::genTMABulkWaitGroup(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 0);
-  auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
-  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
-}
-
 // TRIM
 fir::ExtendedValue
 IntrinsicLibrary::genTrim(mlir::Type resultType,
@@ -10077,6 +8742,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) {
   if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name))
     if (!ppcHandler->argLoweringRules.hasDefaultRules())
      return &ppcHandler->argLoweringRules;
+  if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name))
+    if (!cudaHandler->argLoweringRules.hasDefaultRules())
+      return &cudaHandler->argLoweringRules;
   return nullptr;
 }

From 6c9b5943f846839f8a188dd134a5a5a140d0c348 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 4 Nov 2025 18:47:55 -0800
Subject: [PATCH 267/313] [MLIR] Fix generate-test-checks.py to not remove
 every blank line (#166493)

The stripping of the notes was done on a line-by-line basis, which was
fragile and led to the removal of empty lines everywhere in the file.
Instead, we can strip the note as a single block before splitting the
input into multiple lines.
---
 mlir/utils/generate-test-checks.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index 3712a6b9c963d..22774468bb403 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -230,14 +230,11 @@ def process_line(line_chunks, variable_namer, use_ssa_name=False, strict_name_re

 # Process the source file lines. The source file doesn't have to be .mlir.
-def process_source_lines(source_lines, note, args):
+def process_source_lines(source_lines, args):
     source_split_re = re.compile(args.source_delim_regex)

     source_segments = [[]]
     for line in source_lines:
-        # Remove previous note.
-        if line in note:
-            continue
         # Remove previous CHECK lines.
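        # (CHECK lines left over from a previous run are dropped so that
        # regeneration does not duplicate them. The autogenerated note, by
        # contrast, is now stripped from the raw source as a single block in
        # main() rather than line-by-line here.)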
if line.find(args.check_prefix) != -1: continue @@ -359,9 +356,10 @@ def main(): source_segments = None if args.source: - source_segments = process_source_lines( - [l.rstrip() for l in open(args.source, "r")], autogenerated_note, args - ) + with open(args.source, "r") as f: + raw_source = f.read().replace(autogenerated_note, "") + raw_source_lines = [l.rstrip() for l in raw_source.splitlines()] + source_segments = process_source_lines(raw_source_lines, args) if args.inplace: assert args.output is None From a2977dea61d7eec82a8e7623b015bf695b7ef773 Mon Sep 17 00:00:00 2001 From: Hristo Hristov <hghristov.rmm@gmail.com> Date: Wed, 5 Nov 2025 05:12:17 +0200 Subject: [PATCH 268/313] [libc++][NFC] Removed unsupported compilers from tests (#166403) --- .../atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp | 2 +- .../new.delete/new.delete.array/sized_delete_array.pass.cpp | 1 - .../new.delete/new.delete.single/sized_delete.pass.cpp | 1 - libcxx/test/std/numerics/c.math/signbit.pass.cpp | 2 +- .../std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp | 3 --- .../numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp | 3 --- .../util.smartptr.shared.create/shared_ptr_array.pass.cpp | 1 - .../std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp | 2 +- .../meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp | 2 +- 12 files changed, 4 insertions(+), 22 deletions(-) diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. -// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- 
a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> From 9cd1e4067873ac9d4348bae9de32a34575d1d64f Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <farzonlotfi@microsoft.com> Date: Tue, 4 Nov 2025 22:22:24 -0500 Subject: [PATCH 269/313] [HLSL] Layout Initializer list in Column order via index conversion (#166277) Fixes #165663 The bug was that we were using the initializer list's index to populate the matrix. This meant that [0..n] would correlate to the [0..n] indices of the flattened matrix.
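To make the conversion concrete, here is a minimal standalone sketch of the remapping the fix performs (this sketch is not part of the patch; the 2x3 shape and the printing harness are assumptions added purely for illustration):

  #include <cstdio>

  int main() {
    // For an R x C matrix whose initializer list is written in row-major
    // source order, flattened column-major slot i is filled from
    // initializer element (i % R) * C + (i / R).
    const unsigned Rows = 2, Cols = 3; // a float2x3
    for (unsigned Index = 0; Index < Rows * Cols; ++Index) {
      unsigned ColMajorIndex = (Index % Rows) * Cols + (Index / Rows);
      std::printf("slot %u <- init[%u]\n", Index, ColMajorIndex);
    }
    return 0;
  }

Running it prints the mapping 0, 3, 1, 4, 2, 5, i.e. the column-major order discussed below.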
This is why we were seeing the row-major order: [ 0 1 2 3 4 5 ]. To fix this, we can simply convert these indices to the column-major order: [ 0 3 1 4 2 5 ]. The net effect is that the layout of the matrix is now correct, and we don't need to change the MatrixSubscriptExpr indexing scheme. --------- Co-authored-by: Deric C. <cheung.deric@gmail.com> Co-authored-by: Helena Kotas <hekotas@microsoft.com> --- clang/lib/Sema/SemaInit.cpp | 27 +++--- clang/test/AST/HLSL/matrix-constructors.hlsl | 91 +++++++++--------- .../AST/HLSL/matrix-general-initializer.hlsl | 86 ++++++++--------- .../BasicFeatures/MatrixConstructor.hlsl | 95 +++++++++++++++++++ 4 files changed, 198 insertions(+), 101 deletions(-) create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 073010d16b428..cc6ddf568d346 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity, return; const ConstantMatrixType *MT = DeclType->castAs<ConstantMatrixType>(); + + // For HLSL, the error reporting for this case is handled in SemaHLSL's + // initializer list diagnostics. That means the execution should require + // getNumElementsFlattened to equal getNumInits. In other words, the execution + // should never reach this point if this condition is not true. + assert(IList->getNumInits() == MT->getNumElementsFlattened() && + "Inits must equal Matrix element count"); + QualType ElemTy = MT->getElementType(); - const unsigned MaxElts = MT->getNumElementsFlattened(); - unsigned NumEltsInit = 0; + Index = 0; InitializedEntity ElemEnt = InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity); - while (NumEltsInit < MaxElts && Index < IList->getNumInits()) { + while (Index < IList->getNumInits()) { // Not a sublist: just consume directly. - ElemEnt.setElementIndex(Index); - CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList, + unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() + + (Index / MT->getNumRows()); + ElemEnt.setElementIndex(ColMajorIndex); + CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList, StructuredIndex); - ++NumEltsInit; + ++Index; } - - // For HLSL The error for this case is handled in SemaHLSL's initializer - // list diagnostics, That means the execution should require NumEltsInit - // to equal Max initializers. In other words execution should never - // reach this point if this condition is not true".
- assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts"); } void InitListChecker::CheckVectorType(const InitializedEntity &Entity, diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl index 0a2f03c7c0fac..e1a9c53e2c602 100644 --- a/clang/test/AST/HLSL/matrix-constructors.hlsl +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4))); [numthreads(1,1,1)] void ok() { - // CHECK: VarDecl 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> col:{{[0-9]+}} A 'float2x3':'matrix<float, 2, 3>' cinit // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x3':'matrix<float, 2, 3>' functional cast to float2x3 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x3':'matrix<float, 2, 3>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 A = float2x3(1,2,3,4,5,6); @@ -57,6 +56,8 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' xvalue @@ -68,12 +69,10 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6); @@ -97,9 +96,9 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' functional cast to float2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent @@ -107,10 +106,12 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' functional cast to float2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' xvalue @@ -120,9 +121,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6); @@ -158,7 +157,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float4':'vector<float, 4>' xvalue @@ -172,7 +171,9 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float4':'vector<float, 4>' xvalue @@ -186,9 +187,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 F = float2x3(float4(1,2,3,4), 5, 6); @@ -202,10 +201,10 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 @@ -215,41 +214,41 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 
'int' 5 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector<float, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector<float, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 float2 Vec2 = float2(1.0, 2.0); float2x2 H = float2x2(Vec2,3,4); @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
<col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int' @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' struct S { float2 f; float a;} s; float2x2 J = float2x2(s.f, s.a, s.a); @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // 
CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 4.000000e+00 typedef float2x2 second_level_of_typedefs; second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f); @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'second_level_of_typedefs':'matrix<float, 2, 2>' functional cast to second_level_of_typedefs <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'second_level_of_typedefs':'matrix<float, 2, 2>' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 4.000000e+00 float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f); @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl index 14c950acb7baf..1a631113eb0f0 100644 --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl @@ -26,14 +26,6 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' 
<IntegralToFloating> -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> @@ -42,20 +34,8 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' x -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue @@ -66,10 +46,30 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue -// CHECK-NEXT: 
ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' <LValueToRValue> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' x +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' <LValueToRValue> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 
<col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -138,7 +138,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 4>' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent @@ -146,7 +146,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 4>' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 
'__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: 
DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl new file mode 100644 index 0000000000000..a7c01015b2015 --- /dev/null +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <6 x float> <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00> +// +float3x2 case1() { + // vec[0] = 0 + // vec[1] = 2 + // vec[2] = 4 + // vec[3] = 1 + // vec[4] = 3 + // vec[5] = 5 + return float3x2(0, 1, + 2, 3, + 4, 5); +} + + +RWStructuredBuffer<float> In; + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr 
@_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]] +// CHECK-NEXT: [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]] +// CHECK-NEXT: [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]] +// CHECK-NEXT: [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]] +// CHECK-NEXT: [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[CALL]], align 4 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4 +// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case2() { + // vec[0] = Call + // vec[1] = Call2 + // vec[2] = Call4 + // vec[3] = Call1 + // vec[4] = Call3 + // vec[5] = Call5 + return float3x2(In[0], In[1], + In[2], In[3], + In[4], In[5]); +} + + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: store <3 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <3 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], 
i64 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case3(float3 a, float3 b) { + // vec[0] = A[0] + // vec[1] = A[2] + // vec[2] = B[1] + // vec[3] = A[1] + // vec[4] = B[0] + // vec[5] = B[2] + return float3x2(a,b); +} From 0c73009236384033fc65be4fefb3c3b9b0d19c77 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov <barannikov88@gmail.com> Date: Wed, 5 Nov 2025 06:24:53 +0300 Subject: [PATCH 270/313] [WebAssembly] TableGen-erate SDNode descriptions (#166259) This allows SDNodes to be validated against their expected type profiles and reduces the number of changes required to add a new node. CALL and RET_CALL do not have a description in td files, and it is not currently possible to add one as these nodes have both variable operands and variable results. This also fixes a subtle bug detected by the enabled verification functionality. `LOCAL_GET` is declared with `SDNPHasChain` property, and thus should have both a chain operand and a chain result. The original code created a node without a chain result, which caused a check in `SDNodeInfo::verifyNode()` to fail. Part of #119709. Pull Request: https://github.com/llvm/llvm-project/pull/166259 --- llvm/lib/Target/WebAssembly/CMakeLists.txt | 1 + .../lib/Target/WebAssembly/WebAssemblyISD.def | 64 ------------------- .../WebAssembly/WebAssemblyISelLowering.cpp | 21 +----- .../WebAssembly/WebAssemblyISelLowering.h | 12 ---- .../WebAssemblySelectionDAGInfo.cpp | 24 ++++--- .../WebAssembly/WebAssemblySelectionDAGInfo.h | 17 ++++- 6 files changed, 34 insertions(+), 105 deletions(-) delete mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyISD.def diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. -HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. 
-HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..af322982d5355 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -942,20 +942,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1816,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { 
+const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, From 952d4b4c0bc959afe6bf18a7550fab024ab5a9b8 Mon Sep 17 00:00:00 2001 From: Morris Hafner <mmha@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:35:11 +0800 Subject: [PATCH 271/313] [CIR] Fix assignment ignore in ScalarExprEmitter (#166118) We are missing a couple of cases where we are not supposed to ignore assignment results but did so, which results in compiler crashes. Fix that. Also start ignoring IgnoredExprs unless there are side effects (assignments) inside.
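For orientation, a minimal C sketch of the cases in question follows (it mirrors the new binassign.c test added below); the function name is illustrative only and the snippet is not part of the patch itself:

    /* Sketch only: each comment marks whether an assignment's result is
       consumed (must not be ignored) or discarded (may be ignored). */
    int sketch_ignore_result_cases(void) {
      int arr[10] = {0}; /* zero-initialized here, unlike the test, to keep
                            the sketch well-defined when arr[5] is read */
      int i, j;
      j = i = 123, 0;  /* `i = 123` feeds `j`; the comma expression's overall
                          result, and the trailing `0`, are discarded */
      j = arr[i = 5];  /* the result of `i = 5` feeds the array subscript */
      int *p, *q = 0;
      if ((p = q))     /* the result of `p = q` is the branch condition */
        return 1;
      return 0;
    }
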
--- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 23 ++++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 3 +- clang/test/CIR/CodeGen/binassign.c | 104 +++++++++++++++++++++ 4 files changed, 126 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5ccb431e626ae..4fb178df0e508 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1630,7 +1630,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: - return RValue::get(emitScalarExpr(e)); + return RValue::get(emitScalarExpr(e, ignoreResult)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 119314fe27dce..5eba5ba6c3df1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -78,11 +78,15 @@ struct BinOpInfo { class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { CIRGenFunction &cgf; CIRGenBuilderTy &builder; + // Unlike classic codegen we set this to false or use std::exchange to read + // the value instead of calling TestAndClearIgnoreResultAssign to make it + // explicit when the value is used bool ignoreResultAssign; public: - ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} + ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + bool ignoreResultAssign = false) + : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {} //===--------------------------------------------------------------------===// // Utilities @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { } mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) { + ignoreResultAssign = false; + if (e->getBase()->getType()->isVectorType()) { assert(!cir::MissingFeatures::scalableVectors()); @@ -839,6 +845,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { BinOpInfo emitBinOps(const BinaryOperator *e, QualType promotionType = QualType()) { + ignoreResultAssign = false; BinOpInfo result; result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType); result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType); @@ -924,6 +931,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { #undef HANDLEBINOP mlir::Value emitCmp(const BinaryOperator *e) { + ignoreResultAssign = false; const mlir::Location loc = cgf.getLoc(e->getExprLoc()); mlir::Value result; QualType lhsTy = e->getLHS()->getType(); @@ -1406,11 +1414,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) { } /// Emit the computation of the specified expression of scalar type. 
-mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) { +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e, + bool ignoreResultAssign) { assert(e && hasScalarEvaluationKind(e->getType()) && "Invalid scalar expression to emit"); - return ScalarExprEmitter(*this, builder).Visit(const_cast<Expr *>(e)); + return ScalarExprEmitter(*this, builder, ignoreResultAssign) + .Visit(const_cast<Expr *>(e)); } mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e, @@ -2054,6 +2064,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { const unsigned numInitElements = e->getNumInits(); + [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false); + assert((ignore == false || + (numInitElements == 0 && e->getType()->isVoidType())) && + "init list ignored"); + if (e->hadArrayRangeDesignator()) { cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator"); return {}; diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e5cecaa573a6e..dece642eb13b6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1501,7 +1501,8 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::ArrayRef<mlir::Value> args = {}); /// Emit the computation of the specified expression of scalar type. - mlir::Value emitScalarExpr(const clang::Expr *e); + mlir::Value emitScalarExpr(const clang::Expr *e, + bool ignoreResultAssign = false); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, cir::UnaryOpKind kind, bool isPre); diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 44c54b4a2969a..4520063c56ee6 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -100,3 +100,107 @@ void binary_assign_struct() { // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false) // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true) // OGCG: ret void + +int ignore_result_assign() { + int arr[10]; + int i, j; + j = i = 123, 0; + j = arr[i = 5]; + int *p, *q = 0; + if(p = q) + return 1; + return 0; +} + +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i +// CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["__retval"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["arr"] +// CIR: %[[I:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i"] +// CIR: %[[J:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["j"] +// CIR: %[[P:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["p"] +// CIR: %[[Q:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["q", init] +// CIR: %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i +// CIR: cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i +// CIR: cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!s32i x 10>> -> !cir.ptr<!s32i> +// CIR: %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr<!s32i>, !s32i) -> !cir.ptr<!s32i> +// CIR: %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : 
!s32i, !cir.ptr<!s32i> +// CIR: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>> +// CIR: cir.scope { +// CIR: %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>> +// CIR: %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr<!s32i> -> !cir.bool +// CIR: cir.if %[[COND]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr<!s32i> +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.return +// CIR: } +// CIR: } +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr<!s32i> +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.return + +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign() +// LLVM: %[[RETVAL_PTR:.*]] = alloca i32 +// LLVM: %[[ARR_PTR:.*]] = alloca [10 x i32] +// LLVM: %[[I_PTR:.*]] = alloca i32 +// LLVM: %[[J_PTR:.*]] = alloca i32 +// LLVM: %[[P_PTR:.*]] = alloca ptr +// LLVM: %[[Q_PTR:.*]] = alloca ptr +// LLVM: store i32 123, ptr %[[I_PTR]] +// LLVM: store i32 123, ptr %[[J_PTR]] +// LLVM: store i32 5, ptr %[[I_PTR]] +// LLVM: %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0 +// LLVM: %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[J_PTR]] +// LLVM: store ptr null, ptr %[[Q_PTR]] +// LLVM: br label +// LLVM: %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]] +// LLVM: store ptr %[[Q_VAL]], ptr %[[P_PTR]] +// LLVM: %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null +// LLVM: br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]] +// LLVM: [[THEN]]: +// LLVM: store i32 1, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 +// LLVM: [[ELSE]]: +// LLVM: br label +// LLVM: store i32 0, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 + +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign() +// OGCG: %[[RETVAL:.*]] = alloca i32 +// OGCG: %[[ARR:.*]] = alloca [10 x i32] +// OGCG: %[[I:.*]] = alloca i32 +// OGCG: %[[J:.*]] = alloca i32 +// OGCG: %[[P:.*]] = alloca ptr +// OGCG: %[[Q:.*]] = alloca ptr +// OGCG: store i32 123, ptr %[[I]] +// OGCG: store i32 123, ptr %[[J]] +// OGCG: store i32 5, ptr %[[I]] +// OGCG: %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]] +// OGCG: store i32 %[[ARR_VAL]], ptr %[[J]] +// OGCG: store ptr null, ptr %[[Q]] +// OGCG: %[[Q_VAL:.*]] = load ptr, ptr %[[Q]] +// OGCG: store ptr %[[Q_VAL]], ptr %[[P]] +// OGCG: %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null +// OGCG: br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// OGCG: [[IF_THEN]]: +// OGCG: store i32 1, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN:.*]] +// OGCG: [[IF_END]]: +// OGCG: store i32 0, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN]] +// OGCG: [[RETURN]]: +// OGCG: %{{.*}} = load i32, ptr %[[RETVAL]] +// OGCG: ret i32 From f291f335c9628ea8d855fcc7c246171d70ceff58 Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.patel@intel.com> Date: Tue, 4 Nov 2025 19:37:08 -0800 Subject: [PATCH 272/313] [MLIR][XeGPU] Support order attribute and add pattern for vector.transpose in WgToSg Pass (#165307) This PR does the following: 1. 
Handles the order attribute during delinearization from a linear subgroup id to a multi-dim id. 2. Adds a transformation pattern for vector.transpose in the wg-to-sg pass. 3. Updates CHECKs in the wg-to-sg tests. --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 85 ++++++++-- .../Transforms/XeGPUWgToSgDistribute.cpp | 81 ++++++++-- .../Dialect/XeGPU/subgroup-distribute.mlir | 42 ++--- .../Dialect/XeGPU/xegpu-attr-interface.mlir | 39 +++-- .../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 6 +- .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 88 +++++------ .../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 64 ++++---- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 145 ++++++++++-------- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 123 +++++++-------- 9 files changed, 387 insertions(+), 286 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 397107b786c9e..fb5d1e758dbd1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -280,27 +280,82 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, FailureOr<SmallVector<Value>> LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { - // TODO: handle order attribute - auto hasDefaultOrder = [&]() { - DenseI32ArrayAttr order = getOrder(); - return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>( - llvm::reverse(order.asArrayRef()))); - }; - if (!hasDefaultOrder()) - return mlir::emitError(loc, "order attribute is currently not supported."); - SmallVector<int64_t> layout; + SmallVector<int64_t> sgLayoutInt; if (isForWorkgroup()) { - layout = getEffectiveSgLayoutAsInt(); + sgLayoutInt = getEffectiveSgLayoutAsInt(); } else if (isForSubgroup()) { - layout = getEffectiveLaneLayoutAsInt(); + sgLayoutInt = getEffectiveLaneLayoutAsInt(); } else { return failure(); } - auto dims = llvm::map_to_vector(layout, [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); - return affine::delinearizeIndex(builder, loc, linearId, dims); + DenseI32ArrayAttr orderAttr = getOrder(); + + // Handle order attribute + SmallVector<int64_t> order; + if (orderAttr && !orderAttr.empty()) { + order = llvm::to_vector( + llvm::map_range(orderAttr.asArrayRef(), + [](int32_t idx) { return static_cast<int64_t>(idx); })); + } else { + // Default order: [1, 0] for 2D (row-major), [2, 1, 0] for 3D, etc. + order = llvm::to_vector( + llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size()))); + } + + if (order.size() != sgLayoutInt.size()) { + return failure(); + } + + SmallVector<Value> result(sgLayoutInt.size()); + Value remaining = linearId; + + /// Process dimensions in the order they appear in the order array + /// The first dimension in order is the fastest-changing + /// + /// Example walkthrough for linearId=22, sgLayout=[2,4,4], order=[2,1,0]: + /// + /// Initial: remaining=22, dimIdx = order[i], dimSize = sgLayout[dimIdx], + /// result=[?,?,?]
+ /// + /// i=0 (process columns, dimIdx=2, dimSize=4): + /// result[2] = 22 % 4 = 2 (column coordinate) + /// remaining = 22 / 4 = 5 (5 complete groups of 4 columns processed) + /// + /// i=1 (process rows, dimIdx=1, dimSize=4): + /// result[1] = 5 % 4 = 1 (row coordinate) + /// remaining = 5 / 4 = 1 (1 complete group of 4 rows processed) + /// + /// i=2 (process layers, dimIdx=0, dimSize=2): + /// result[0] = 1 % 2 = 1 (layer coordinate) + /// (no remaining update - last iteration) + /// + /// Final result: [1,1,2] = Layer 1, Row 1, Column 2 + for (size_t i = 0; i < order.size(); ++i) { + int64_t dimIdx = order[i]; + int64_t dimSize = sgLayoutInt[dimIdx]; + + Value dimSizeVal = + builder.createOrFold<arith::ConstantIndexOp>(loc, dimSize); + + /// Extract the coordinate for this dimension using modulo operation + /// This gives us "how far within this dimension" we are + /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within + /// this dimension) + result[dimIdx] = + builder.createOrFold<index::RemUOp>(loc, remaining, dimSizeVal); + + /// Update remaining for the next dimension by removing what we've already + /// processed. Division tells us "how many complete groups of this dimension + /// we've gone through" e.g., linearId=22, dimSize=4: 22 / 4 = 5 (we've + /// completed 5 groups of 4) Skip this for the last iteration since there's + /// no next dimension to process + if (i < order.size() - 1) { + remaining = + builder.createOrFold<index::DivUOp>(loc, remaining, dimSizeVal); + } + } + return result; } /// Implements DistributeLayoutAttr::computeDistributedCoords to generate diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index d12a04df5c46c..0a9ef0aa6df96 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -1219,6 +1219,70 @@ struct WgToSgMultiDimReductionOp } }; +// This pattern transforms vector.transpose ops to work at subgroup level. 
+struct WgToSgVectorTransposeOp + : public OpConversionPattern<vector::TransposeOp> { + using OpConversionPattern<vector::TransposeOp>::OpConversionPattern; + + LogicalResult + matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + VectorType resultType = op.getResultVectorType(); + + ArrayRef<int64_t> wgShape = resultType.getShape(); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) + return failure(); + + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(op.getVector()); + if (!sourceLayout || !sourceLayout.isForWorkgroup()) + return failure(); + + SmallVector<int64_t> sourceSgLayout = + sourceLayout.getEffectiveSgLayoutAsInt(); + SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt(); + DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder(); + DenseI32ArrayAttr resultOrder = layout.getOrder(); + + if (!sourceOrder || !resultOrder) { + return rewriter.notifyMatchFailure( + op, "Both source and result must have order attributes"); + } + + ArrayRef<int64_t> permutation = op.getPermutation(); + size_t permutationSize = permutation.size(); + if (sourceSgLayout.size() != permutationSize || + resultSgLayout.size() != permutationSize) { + return rewriter.notifyMatchFailure( + op, "Layouts and permutation must have the same rank"); + } + + // Check that sgLayout, sgData & order are properly transposed for source + // and result + if (!layout.isTransposeOf(sourceLayout, permutation)) + return rewriter.notifyMatchFailure( + op, "Result layout is not a valid transpose of source layout " + "according to permutation"); + + SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first; + VectorType newResultType = + VectorType::get(sgShape, resultType.getElementType()); + SmallVector<Value> newTransposeOps; + for (auto src : adaptor.getVector()) { + auto newTranspose = vector::TransposeOp::create( + rewriter, op.getLoc(), newResultType, src, permutation); + xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); + newTransposeOps.push_back(newTranspose.getResult()); + } + + rewriter.replaceOpWithMultiple(op, {newTransposeOps}); + return success(); + } +}; + } // namespace namespace mlir { @@ -1233,7 +1297,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset, WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp, WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp, - WgToSgMultiDimReductionOp>(patterns.getContext()); + WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp>( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1360,7 +1425,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp>( + target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp, + vector::TransposeOp, vector::BroadcastOp, + vector::MultiDimReductionOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); @@ -1379,16 +1446,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::BroadcastOp>( - [=](vector::BroadcastOp op) -> bool { - return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); - }); - - target.addDynamicallyLegalOp<vector::MultiDimReductionOp>( - [=](vector::MultiDimReductionOp op) -> bool { - return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); - }); - target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>( [=](xegpu::ConvertLayoutOp op) -> bool { return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8946d14e80b72..8fd3cca5594cb 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -268,15 +268,16 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { -// CHECK: %[[LAYOUT_X:.*]] = arith.constant 8 : index -// CHECK: %[[LAYOUT_Y:.*]] = arith.constant 2 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_Y]], %[[LAYOUT_Y]] -// CHECK: %[[LANE_X_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[LAYOUT_X]] -// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> -// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index @@ -288,19 +289,20 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { -// CHECK: %[[DIST_UNIT_HEIGHT_X:.*]] = arith.constant 4 : index -// CHECK: %[[DIST_UNIT_HEIGHT_Y:.*]] = arith.constant 8 : index -// CHECK: %[[LANE_DATA_Y:.*]] = arith.constant 2 : index -// CHECK: %[[USER_OFFSET_X:.*]] = arith.constant 1 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[LANE_Y_OFFSET_1:.*]] = index.mul %[[DELINEARIZED_LANE_Y]], %[[LANE_DATA_Y]] -// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[LANE_Y_OFFSET_1]], %[[DIST_UNIT_HEIGHT_Y]] -// 
CHECK: %[[LANE_X_OFFSET_1:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[DIST_UNIT_HEIGHT_X]] -// CHECK: %[[LANE_X_OFFSET:.*]] = index.add %[[LANE_X_OFFSET_1]], %[[USER_OFFSET_X]] -// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> -// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]] +// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]] +// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index b73bc69393dab..02c5f71d5c83d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -1,33 +1,32 @@ // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s -//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test { gpu.func @slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return %step : vector<128xindex> } gpu.func @nested_slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi 
[[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex> gpu.return %0 : vector<128xindex> } -} \ No newline at end of file +} + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 09df1e4da43e2..9580769d37313 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32> - // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: arith.negf %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} : vector<24x32xf32> - // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: math.powf %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index d2d250cbe0f66..01134d8eaabec 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,14 +1,10 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -#map = affine_map<()[s0] -> (s0 floordiv 4)> -#map1 = affine_map<()[s0] -> (s0 mod 4)> - gpu.module @test_round_robin_assignment { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.create_nd_tdesc %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 
16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -16,22 +12,23 @@ gpu.module @test_round_robin_assignment { } // CHECK-LABEL: create_nd_tdesc_with_shared_data - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]] - //CHECK: [[C16:%.+]] = arith.constant 16 : index - //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]] - //CHECK: [[C64:%.+]] = arith.constant 64 : index - //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]] - //CHECK: [[C64_2:%.+]] = arith.constant 64 : index - //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]] - //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]] + // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]] + // CHECK: %[[C8:.*]] = arith.constant 8 : index + // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]] + // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[LY:.*]] = index.mul %[[IDY]], %[[C16]] + // CHECK: %[[C64:.*]] = arith.constant 64 : index + // CHECK: %[[LX:.*]] = index.mul %[[IDX]], %[[C64]] + // CHECK: %[[C128:.*]] = arith.constant 128 : index + // CHECK: %[[OFFY:.*]] = index.remu %[[LY]], %[[C128]] + // CHECK: %[[C64_1:.*]] = arith.constant 64 : index + // CHECK: %[[OFFX:.*]] = index.remu %[[LX]], %[[C64_1]] + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>> gpu.return @@ -42,9 +39,7 @@ gpu.module @test_round_robin_assignment { gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.load_nd %{{.*}} - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,9 +52,8 @@ gpu.module @test_round_robin_assignment { gpu.func @store_nd(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT : 
xegpu.store_nd + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-NOT: xegpu.store_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> @@ -73,8 +67,7 @@ gpu.module @test_round_robin_assignment { gpu.func @update_nd(%src: memref<256x128xf32>){ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>> + // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.update_nd_offset %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -84,15 +77,9 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -113,8 +100,7 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src[0, 0] : 
memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -131,9 +117,7 @@ gpu.module @test_round_robin_assignment { %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>> -> vector<128x1xf32> - // CHECK-COUNT-2: vector.broadcast {{.*}} - // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32> // CHECK-NOT: vector.broadcast %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>} @@ -171,10 +155,10 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> - //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) + // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c10_i32 : i32 - //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 + // CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32) @@ -195,16 +179,16 @@ gpu.module @test_round_robin_assignment { %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %3 = arith.cmpi eq, %0, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (vector<16xf32>, vector<16xf32>) + // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>} xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -220,16 +204,16 @@ gpu.module @test_round_robin_assignment { %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) + // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = 
[16]>>) { %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } else { %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -238,8 +222,8 @@ gpu.module @test_round_robin_assignment { gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> - //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> - //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> + // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> + // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>, target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 86a021b66949c..84ce80f477a55 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,13 +14,11 @@ gpu.module @test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> gpu.return @@ -28,8 +26,7 @@ gpu.module 
@test_distribution { // CHECK-LABEL: store_nd_with_offset gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -42,10 +39,8 @@ gpu.module @test_distribution { } // CHECK-LABEL: prefetch_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,15 +52,11 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -102,27 +93,42 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { 
// CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[MAP4:.*]] = affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: %[[MAP5:.*]] = affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[MAP4]], %[[C2:.*]] - // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MAP5]], %[[C1:.*]] + // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2:.*]] + // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C1:.*]] // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[ADD16]], %[[C32:.*]] - // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[MAP5]], %[[C1:.*]] - // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU1]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU5:.*]] = index.remu %[[ADD16]], %[[C32:.*]] + // CHECK-DAG: %[[REMU6:.*]] = index.remu %[[REMU1]], %[[C1:.*]] + // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index - // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU2]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex> - // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index + // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU5]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index - // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU6]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex> %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x128xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32> + // 
CHECK-NOT: vector.transpose + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32> + gpu.return + } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 8d98fcfd0d2c2..4fbb566cfbe73 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -1,8 +1,5 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> -//CHECK: #map2 = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc_no_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -26,13 +23,23 @@ gpu.module @test_distribution { } // CHECK-LABEL: load_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: %[[LOAD:.*]] = xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + //CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + //CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + //CHECK-DAG: %[[L_OFF_Y:.*]] = index.mul %[[SGIDY]], %[[C32]] + //CHECK-DAG: %[[L_OFF_X:.*]] = index.mul %[[SGIDX]], %[[C32]] + //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index + //CHECK-DAG: %[[OFF_Y:.*]] = index.remu %[[L_OFF_Y]], %[[C256]] + //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + //CHECK-DAG: %[[OFF_X:.*]] = index.remu %[[L_OFF_X]], %[[C128]] + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -43,9 +50,6 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offsets // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 
16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -60,9 +64,6 @@ gpu.module @test_distribution { // CHECK-LABEL: prefetch_nd_tdesc_with_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %cst0 = arith.constant 0 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> @@ -319,17 +320,15 @@ gpu.module @test_distribution { gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_1:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -346,17 +345,15 @@ gpu.module @test_distribution { //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_2:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: 
[[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -411,14 +408,17 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_step_op gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]] - //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index - //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c8]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgId]], [[c8]] + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c4]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return @@ -426,14 +426,14 @@ gpu.module @test_distribution { gpu.func @vector_step_op_layout_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index - //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index - //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> + //CHECK: [[c16:%.+]] = arith.constant 16 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c16]] + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[sgidx]], [[c8]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex> gpu.return @@ -464,14 +464,27 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> + -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x32xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = 
[1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32> + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32> + gpu.return + } + // CHECK-LABEL: non_splat_constant_2D gpu.func @non_splat_constant_2D() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[IDY:.*]] = index.remu %{{.*}}, %[[C32:.*]] - // CHECK-DAG: %[[IDX:.*]] = index.remu %{{.*}}, %[[C1:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[IDY:.*]] = index.remu %[[SGIDY]], %{{.*}} + // CHECK-DAG: %[[IDX:.*]] = index.remu %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index @@ -484,20 +497,19 @@ gpu.module @test_distribution { // CHECK-LABEL: non_splat_constant_2D_non_unit_dim gpu.func @non_splat_constant_2D_non_unit_dim() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}} : vector<2x2xindex> + // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[IDY:.*]] = affine.apply #map()[%[[SGID]]] - // CHECK-DAG: %[[IDX:.*]] = affine.apply #map1()[%[[SGID]]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[IDY]], %[[C2:.*]] - // CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[IDX]], %[[C2:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[SGIDY]], %[[C2:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]] - // CHECK-DAG: %[[C8_2:.*]] = arith.constant 8 : index - // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %[[C8:.*]] - // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %{{.*}} + // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[ @@ -517,13 +529,14 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C32:.*]] - // 
CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[REMU]], %{{.*}} + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> - // CHECK: arith.constant dense<{{\[}}[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]{{\]}}> : vector<1x16xindex> + // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index e83229e3a3995..5ce3d1d0fb5d6 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -1,47 +1,35 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 
4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref - // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_2:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[C0_3:%.+]] = arith.constant 0 : index - //CHECK: [[C0_4:%.+]] = arith.constant 0 : index - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return @@ -81,25 +69,24 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return -} + } -// CHECK-LABEL: update_nd -// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> -gpu.func @update_nd(%src: memref<256x128xf32>){ - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - %update = xegpu.update_nd_offset %tdesc, [0, 16] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} + // CHECK-LABEL: update_nd + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @update_nd(%src: memref<256x128xf32>){ + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc 
%[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] + // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + %update = xegpu.update_nd_offset %tdesc, [0, 16] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return + } -// CHECK-LABEL: dpas -gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas + gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>> %load_a = xegpu.load_nd %tdesc_a @@ -110,16 +97,15 @@ gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } - -// CHECK-LABEL: dpas_no_sg_data -gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas_no_sg_data + gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>> @@ -134,6 +120,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> @@ -196,9 +183,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { } gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: 
memref<1024x1024xf32>) { - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + // CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c1024 = arith.constant 1024 : index @@ -211,15 +198,15 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> - // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] - // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] + // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> - // CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16> - // CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16> - // CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> + // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> + // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> + // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) { @@ -252,7 +239,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { // CHECK: scf.condition{{.*}} : vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { - // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32) + // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> %4 = arith.addi 
%arg3, %c1_i32 : i32 @@ -344,9 +331,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %cond4 = arith.cmpi slt, %sg_id, %c31 : index %cond5 = arith.andi %cond3, %cond4 : i1 scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc From c5aaee0bb07b221e5d3314bbdcf1abc4a604d6bd Mon Sep 17 00:00:00 2001 From: Kewen Meng <Kewen.Meng@amd.com> Date: Tue, 4 Nov 2025 19:43:16 -0800 Subject: [PATCH 273/313] Revert commit d8e5698 and 15b19c7 (#166498) --- libc/src/stdio/CMakeLists.txt | 24 --------- libc/src/stdio/asprintf.cpp | 18 +------ libc/src/stdio/baremetal/CMakeLists.txt | 8 --- libc/src/stdio/baremetal/printf.cpp | 23 ++------ libc/src/stdio/baremetal/vprintf.cpp | 23 ++------ libc/src/stdio/generic/CMakeLists.txt | 4 -- libc/src/stdio/generic/fprintf.cpp | 17 +----- libc/src/stdio/generic/printf.cpp | 17 +----- libc/src/stdio/generic/vfprintf.cpp | 17 +----- libc/src/stdio/generic/vprintf.cpp | 17 +----- libc/src/stdio/printf_core/CMakeLists.txt | 25 --------- libc/src/stdio/printf_core/core_structs.h | 19 +++---- libc/src/stdio/printf_core/error_mapper.h | 21 -------- .../stdio/printf_core/generic/CMakeLists.txt | 8 --- .../stdio/printf_core/generic/error_mapper.h | 49 ----------------- .../stdio/printf_core/linux/CMakeLists.txt | 8 --- .../stdio/printf_core/linux/error_mapper.h | 54 ------------------- libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 ++++--- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++--------- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 +------ libc/src/stdio/sprintf.cpp | 18 +------ libc/src/stdio/vasprintf.cpp | 16 +----- libc/src/stdio/vsnprintf.cpp | 19 +------ libc/src/stdio/vsprintf.cpp | 17 +----- libc/src/stdlib/CMakeLists.txt | 6 --- libc/src/stdlib/strfromd.cpp | 11 +--- libc/src/stdlib/strfromf.cpp | 11 +--- libc/src/stdlib/strfroml.cpp | 11 +--- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 - libc/test/src/stdio/fprintf_test.cpp | 24 --------- .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ------ libc/test/src/stdio/vfprintf_test.cpp | 5 -- libc/test/src/stdlib/StrfromTest.h | 19 +------ 39 files changed, 104 insertions(+), 588 deletions(-) delete mode 100644 libc/src/stdio/printf_core/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index c75c8b11be2b5..b0a6ef1e291b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,10 +125,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main 
libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -140,10 +136,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -154,10 +146,6 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -169,10 +157,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -184,10 +168,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -198,10 +178,6 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 0991dfca6a059..f8cfb74ce48ea 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,12 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -26,18 +22,8 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vasprintf_internal(buffer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret = printf_core::vasprintf_internal(buffer, format, args); + return ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index bfeff0e2b5880..548938f885c94 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,12 +29,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -91,12 +87,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 5a9b19ff20471..7253c6549a4e4 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -45,25 +42,13 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index c172b368d15f3..ab02533f14911 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" -#include 
"src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -43,25 +40,13 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 71055edea3d9e..6361822b61999 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,11 +393,7 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list - libc.src.__support.CPP.limits - libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index b2033901557a0..087aeadfc52c5 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -30,18 +27,8 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index 8d159d5c70870..bb7c7c86f843f 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -34,19 +31,9 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index a26f082ed9347..01f4265f118a6 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vfprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,18 +24,8 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index ae2160219f2bb..08d71515646ed 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -32,19 +29,9 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 624129b2b36e7..ee66145e60156 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,17 +32,6 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) - add_subdirectory(${LIBC_TARGET_OS}) -else() - add_subdirectory(generic) -endif() - -set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) -if(NOT TARGET ${target_error_mapper}) - set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) -endif() - add_header_library( printf_config HDRS @@ -58,7 +47,6 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits - libc.hdr.errno_macros ) add_header_library( @@ -137,7 +125,6 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list - libc.src.__support.error_or ) add_header_library( @@ -149,20 +136,10 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list - libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - ${target_error_mapper} - libc.src.__support.macros.properties.architectures -) - if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -175,10 +152,8 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file - libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) - diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 0d41f2244d8da..e27f77b6b594a 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,17 +132,14 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the error return values used by the printf engine when an -// error has occurred. They are all large negative, distinct values starting -// from -1000 to not overlap with system errors. -constexpr int FILE_WRITE_ERROR = -1001; -constexpr int FILE_STATUS_ERROR = -1002; -constexpr int NULLPTR_WRITE_ERROR = -1003; -constexpr int INT_CONVERSION_ERROR = -1004; -constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; -constexpr int ALLOCATION_ERROR = -1006; -constexpr int OVERFLOW_ERROR = -1007; - +// These are the printf return values for when an error has occurred. They are +// all negative, and should be distinct. +constexpr int FILE_WRITE_ERROR = -1; +constexpr int FILE_STATUS_ERROR = -2; +constexpr int NULLPTR_WRITE_ERROR = -3; +constexpr int INT_CONVERSION_ERROR = -4; +constexpr int FIXED_POINT_CONVERSION_ERROR = -5; +constexpr int ALLOCATION_ERROR = -6; } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h deleted file mode 100644 index 23030930133a1..0000000000000 --- a/libc/src/stdio/printf_core/error_mapper.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Error mapper for printf ---------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H - -#include "src/__support/macros/properties/architectures.h" - -// Maps internal errors to the available errnos on the platform. -#if defined(__linux__) -#include "linux/error_mapper.h" -#else -#include "generic/error_mapper.h" -#endif - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/generic/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h deleted file mode 100644 index d8cdd2cc2dbaa..0000000000000 --- a/libc/src/stdio/printf_core/generic/error_mapper.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Generic implementation of error mapper ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to the available C standard errnos. - switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - case FILE_STATUS_ERROR: - case NULLPTR_WRITE_ERROR: - case ALLOCATION_ERROR: - return EDOM; - case INT_CONVERSION_ERROR: - case FIXED_POINT_CONVERSION_ERROR: - case OVERFLOW_ERROR: - return ERANGE; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EDOM; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/linux/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h deleted file mode 100644 index 3c2fe663072d0..0000000000000 --- a/libc/src/stdio/printf_core/linux/error_mapper.h +++ /dev/null @@ -1,54 +0,0 @@ -//===-- Linux implementation of error mapper --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to POSIX errnos. 
- switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - return EIO; - case FILE_STATUS_ERROR: - return EIO; - case NULLPTR_WRITE_ERROR: - return EINVAL; - case INT_CONVERSION_ERROR: - return ERANGE; - case FIXED_POINT_CONVERSION_ERROR: - return EINVAL; - case ALLOCATION_ERROR: - return ENOMEM; - case OVERFLOW_ERROR: - return EOVERFLOW; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EINVAL; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 1c7a7237c097d..57f29858d5298 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -23,9 +22,8 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -ErrorOr<size_t> printf_main(Writer<write_mode> *writer, - const char *__restrict str, - internal::ArgList &args) { +int printf_main(Writer<write_mode> *writer, const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -35,8 +33,9 @@ ErrorOr<size_t> printf_main(Writer<write_mode> *writer, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); + if (result < 0) - return Error(-result); + return result; } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 41df17b67f35b..283d8df2810fb 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,7 +10,6 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,7 +29,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return ALLOCATION_ERROR; + return printf_core::ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -43,28 +42,27 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, - const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { + if (ret_val < 0) { *ret = nullptr; - return ret_val; + return -1; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char *>(malloc(ret_val.value() + 
1)); + *ret = static_cast<char *>(malloc(ret_val + 1)); if (ret == nullptr) - return Error(ALLOCATION_ERROR); - inline_memcpy(*ret, wb.buff, ret_val.value()); + return printf_core::ALLOCATION_ERROR; + inline_memcpy(*ret, wb.buff, ret_val); } else { *ret = wb.buff; } - (*ret)[ret_val.value()] = '\0'; + (*ret)[ret_val] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..630de9d9d43dd 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,7 +11,6 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -36,8 +35,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, FILE *f) { +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -48,11 +47,9 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, ::FILE *f) { - // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + ::FILE *f) { + return ::fwrite_unlocked(ptr, size, nmemb, f); } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -63,38 +60,26 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - // Propagate actual system error in FileIOResult. - if (write_result.has_error()) - return -write_result.error; - - // In case short write occured or error was not set on FileIOResult for some - // reason. 
- if (write_result.value != new_str.size() || - internal::ferror_unlocked(target_file)) + size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + if (written != new_str.size() || internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; - return WRITE_OK; } -LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - auto retval = printf_main(&writer, format, args); - if (!retval.has_value()) { - internal::funlockfile(stream); - return retval; - } + int retval = printf_main(&writer, format, args); int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = Error(-flushval); + retval = flushval; internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 04b2bef05bc7b..efcff278bd284 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - size_t written = writer->get_chars_written(); + int written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 9de108ece510f..1d4734a51b9b8 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - size_t chars_written = 0; + int chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += new_string.size(); + chars_written += static_cast<int>(new_string.size()); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += length; + chars_written += static_cast<int>(length); if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE size_t get_chars_written() { return chars_written; } + LIBC_INLINE int get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index d95195f6f485f..c8940862f711f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,21 +32,10 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 2a9b6ea7c5e50..7be97d3591aaf 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,20 +33,9 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index bd77cd8864312..4a44d4a0f8842 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,11 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" 
-#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,17 +18,7 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vasprintf_internal(ret, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return printf_core::vasprintf_internal(ret, format, args); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index 5d936360c0857..b07a2499a0dd3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/vsnprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,21 +29,10 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index f9cf8118534f6..26d497be42125 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,19 +30,9 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 1ccdcc8bec148..c464f82dcbda7 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,8 +73,6 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -85,8 +83,6 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -97,8 +93,6 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index 71e257f08645b..f51e6d4c7f1df 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,13 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 65f242b200f18..14dbfdb25bab6 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include 
"src/stdlib/strfromf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,13 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 31668a0323c93..12f22a8a2fb65 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -44,13 +41,7 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..c7e590627094a 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,8 +36,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - // TODO: Use ErrorOr<size_t> - return static_cast<int>(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index d71f1dff11943..eec108bc12ca5 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,8 +186,6 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 2cea7f554ce38..6799323cc6ad9 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,9 +15,6 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -34,8 +31,6 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test -using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -83,25 +78,6 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - 
ASSERT_ERRNO_FAILURE(); - - ASSERT_EQ(printf_test::fclose(file), 0); -} - -#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ - !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) -TEST(LlvmLibcFPrintfTest, NullPtrCheck) { - const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); - auto FILE_PATH = libc_make_test_file_path(FILENAME); - - ::FILE *file = printf_test::fopen(FILE_PATH, "w"); - ASSERT_FALSE(file == nullptr); - - int ret = - LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); - EXPECT_LT(ret, 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } -#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index 2dae2a22c864c..bf088937e4104 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), size_t{2}); + ASSERT_EQ(writer.get_chars_written(), 2); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + 
ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), size_t{5}); + ASSERT_EQ(writer.get_chars_written(), 5); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{18}); + ASSERT_EQ(writer.get_chars_written(), 18); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), size_t{6}); + ASSERT_EQ(writer.get_chars_written(), 6); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{10}); + ASSERT_EQ(writer.get_chars_written(), 10); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d263cf55aa474..d036341be7981 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), size_t{99}); + ASSERT_EQ(writer.get_chars_written(), 99); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - 
ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index 95507e0885dbf..baaa664cdc9ee 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using 
LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -63,14 +59,3 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } - -TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { - char buff[0]; - - // Trigger an overflow in the return value of snprintf by writing more than - // INT_MAX bytes. - int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); - int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); - EXPECT_LT(written, 0); - ASSERT_ERRNO_FAILURE(); -} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index 9b5f09db8fd41..f50565a0f68ca 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,8 +19,6 @@ #include "src/stdio/vfprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -46,8 +44,6 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } -using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -94,7 +90,6 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index fd2e0f120e90e..e82c94499aa11 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,8 +8,6 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -17,7 +15,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class StrfromTest : public LIBC_NAMESPACE::testing::Test { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -483,16 +481,6 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } - - void charsWrittenOverflow(FunctionT func) { - char buff[100]; - // Trigger an overflow in the return value of strfrom by writing more than - // INT_MAX bytes. 
- int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); - - EXPECT_LT(result, 0); - ASSERT_ERRNO_FAILURE(); - } }; #define STRFROM_TEST(InputType, name, func) \ @@ -513,7 +501,4 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ - TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ - charsWrittenOverflow(func); \ - } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } From cdf52a1325e6329cef267ca93886481ddc5d554d Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Wed, 5 Nov 2025 03:52:25 +0000 Subject: [PATCH 274/313] [msan][NFCI] Generalize handleVectorPmaddIntrinsic() (#166282) This generalizes `handleVectorPmaddIntrinsic()`: - potentially handle floating-point type intrinsics (e.g., `llvm.x86.avx512bf16.dpbf16ps.512`). This usage is not enabled yet. - "multiplication with an initialized zero guarantees that the corresponding output becomes initialized" is now gated by a parameter --- .../Instrumentation/MemorySanitizer.cpp | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). 
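// A minimal worked example of the per-element rule above (illustrative
// lane values only, not part of the patch), for one pair of lanes with
// ZeroPurifies:
//   Va = <?, 2>   Vb = <0, 3>     (? = uninitialized)
//   Sa = <~0, 0>  Sb = <0, 0>     (all-ones shadow marks the ? lane)
//   Lane 0: SaNonZero=1, SbNonZero=0, VbNonZero=0
//           -> (1&0) | (VaNonZero&0) | (1&0) = 0, i.e. "? * 0" is treated
//              as initialized, matching the integer identity x * 0 == 0.
//   Lane 1: both shadows are zero, so the output lane stays initialized.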
- Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
  // (The real pmadd intrinsic would have computed intermediate values of
@@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
   case Intrinsic::x86_avx2_pmadd_ub_sw:
   case Intrinsic::x86_avx512_pmaddubs_w_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true);
     break;

   // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
   case Intrinsic::x86_ssse3_pmadd_ub_sw:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
     break;

   // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
   case Intrinsic::x86_mmx_pmadd_wd:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
     break;

   // AVX Vector Neural Network Instructions: bytes
@@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_avx2_vpdpbuuds_128:
   case Intrinsic::x86_avx2_vpdpbuuds_256:
   case Intrinsic::x86_avx10_vpdpbuuds_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
     break;

   // AVX Vector Neural Network Instructions: words
@@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_avx512_vpdpwssds_128:
   case Intrinsic::x86_avx512_vpdpwssds_256:
   case Intrinsic::x86_avx512_vpdpwssds_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
     break;

   // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single

From b5f200129ad96f87bce11e5c8f0eafeb00b70b9c Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Tue, 4 Nov 2025 22:40:40 -0600
Subject: [PATCH 275/313] [CodeGen] Register-coalescer remat fix subreg liveness (#165662)

This fixes a bug in rematerialization where the liveness of the subreg
lane mask was incorrectly updated, causing a crash in the scheduler.
---
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |  24 ++--
 .../AMDGPU/reg-coalescer-subreg-liveness.mir  | 131 ++++++++++++++++++
 .../SystemZ/regcoal_remat_empty_subrange.ll   |   4 +-
 3 files changed, 149 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 38f6deb39ddf3..99f76936a180f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
   SlotIndex DefIndex =
       CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
   VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+
+  // Refine the subranges that are now defined by the remat.
+  // This will split existing subranges if necessary.
+  DstInt.refineSubRanges(
+      Alloc, DstMask,
+      [&DefIndex, &Alloc](LiveInterval::SubRange &SR) {
+        // We know that this lane is defined by this instruction,
+        // but at this point it might not be live because it was not defined
+        // by the original instruction. This happens when the
+        // rematerialization widens the defined register. Assign that lane a
+        // dead def so that the interferences are properly modeled.
+        if (!SR.liveAt(DefIndex))
+          SR.createDeadDef(DefIndex, Alloc);
+      },
+      *LIS->getSlotIndexes(), *TRI);
+
   for (LiveInterval::SubRange &SR : DstInt.subranges()) {
     if ((SR.LaneMask & DstMask).none()) {
       LLVM_DEBUG(dbgs()
@@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
       // updateRegDefUses. The original subrange def may have only undefed
       // some lanes.
       UpdatedSubRanges = true;
-    } else {
-      // We know that this lane is defined by this instruction,
-      // but at this point it might not be live because it was not defined
-      // by the original instruction. This happens when the
-      // rematerialization widens the defined register. Assign that lane a
-      // dead def so that the interferences are properly modeled.
-      if (!SR.liveAt(DefIndex))
-        SR.createDeadDef(DefIndex, Alloc);
     }
   }
   if (UpdatedSubRanges)
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000000000..381cb8c9d1047
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+  ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+  ; CHECK-NEXT: S_BRANCH %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: S_ENDPGM 0
+  bb.0:
successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32 = S_MOV_B32 1 + undef %3.sub0:sgpr_128 = COPY %2 + %4:sreg_32 = S_MOV_B32 0 + undef %5.sub0:sgpr_256 = COPY %4 + TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %6:sgpr_128 = COPY killed %3 + %6.sub1:sgpr_128 = COPY killed %1 + %7:sreg_32 = COPY $exec_lo + %8:sreg_32 = COPY %2 + %9:sreg_32 = COPY %4 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %10:sreg_32 = COPY killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
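A standalone toy sketch of the invariant the fix above restores (the type and function here are illustrative simplifications, not LLVM's API): every subrange whose lanes are covered by the widened rematerialized def, but which had no def of its own at that point, must be given a (possibly dead) def.

  #include <cstdint>
  #include <vector>

  // Toy model: a subrange tracks liveness for a lane mask. A remat may
  // write more lanes than the instruction it replaces, so any subrange
  // it newly covers needs a def there; otherwise the verifier sees a
  // lane whose defining instruction does not modify the register.
  struct ToySubRange {
    uint32_t LaneMask;   // which lanes this subrange tracks
    bool HasDefAtRemat;  // already defined/live at the remat point?
  };

  void refine_for_remat(std::vector<ToySubRange> &SubRanges,
                        uint32_t DefMask) {
    for (ToySubRange &SR : SubRanges)
      if ((SR.LaneMask & DefMask) != 0 && !SR.HasDefAtRemat)
        SR.HasDefAtRemat = true; // analogous to SR.createDeadDef(...)
  }

The actual fix additionally routes this through refineSubRanges() so that existing subranges are split along the new def's lane mask before the dead defs are added.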
diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
index 678d9a9073155..ff9b6a34c1d53 100644
--- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
+++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
@@ -22,10 +22,10 @@ define void @main(i16 %in) {
 ; CHECK-NEXT:    locghile %r3, 1
 ; CHECK-NEXT:    o %r0, 0(%r1)
 ; CHECK-NEXT:    larl %r1, g_222
-; CHECK-NEXT:    lghi %r5, 0
 ; CHECK-NEXT:    dsgfr %r2, %r0
+; CHECK-NEXT:    lghi %r3, 0
 ; CHECK-NEXT:    stgrl %r2, g_39
-; CHECK-NEXT:    stc %r5, 19(%r1)
+; CHECK-NEXT:    stc %r3, 19(%r1)
 ; CHECK-NEXT:    br %r14
   %tmp = load i32, ptr @g_151, align 4
   %tmp3 = or i32 %tmp, 1

From 6111ff16df372fd78cf9dc8fa953573ea2446f82 Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Wed, 5 Nov 2025 10:22:11 +0530
Subject: [PATCH 276/313] [RISCV] Implement shouldFoldMaskToVariableShiftPair (#166159)

Folding a mask into a variable shift pair results in better code size as
long as the values are scalars that are <= XLen.

Similar to https://github.com/llvm/llvm-project/pull/158069
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   9 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   2 +
 .../test/CodeGen/RISCV/mask-variable-shift.ll | 132 ++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/mask-variable-shift.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b8605629e2dfe..c3f100e3197b1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -25309,3 +25309,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const {
   }
   return {};
 }
+
+bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
+  EVT VT = Y.getValueType();
+
+  if (VT.isVector())
+    return false;
+
+  return VT.getSizeInBits() <= Subtarget.getXLen();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9e3e2a9443625..dd62a9cf6c9e2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -465,6 +465,8 @@ class RISCVTargetLowering : public TargetLowering {
 
   ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
 
+  bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
   /// Match a mask which "spreads" the leading elements of a vector evenly
   /// across the result. Factor is the spread amount, and Index is the
   /// offset applied.
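For context, the equivalence this hook enables: an AND with a mask built from a variable shift can be rewritten as a shift pair, so no mask has to be materialized. A minimal sketch in plain C++ (the function name and the uint32_t scalar width are assumptions for illustration, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // For 0 <= y < 32, clearing the low y bits with a shifted mask is the
  // same as a right shift followed by a left shift; the shift pair needs
  // no mask register, which is why the hook reports the fold as
  // profitable for scalars that fit in XLen.
  uint32_t clear_low_bits(uint32_t x, unsigned y) {
    uint32_t mask_form = x & (~0u << y);      // and with a variable mask
    uint32_t shift_pair_form = (x >> y) << y; // srl then sll
    assert(mask_form == shift_pair_form);
    return shift_pair_form;
  }

This is exactly the shape in the mask_pair test below, where `shl nsw i32 -1, %y` followed by `and` now lowers to srl/sll instead of materializing the mask.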
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll new file mode 100644 index 0000000000000..4e73cee30ef08 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64 + +define i32 @mask_pair(i32 %x, i32 %y) { +; RV32-LABEL: mask_pair: +; RV32: # %bb.0: +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: sll a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair: +; RV64: # %bb.0: +; RV64-NEXT: srlw a0, a0, a1 +; RV64-NEXT: sllw a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; RV32-LABEL: mask_pair_64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, -1 +; RV32-NEXT: addi a4, a2, -32 +; RV32-NEXT: sll a3, a3, a2 +; RV32-NEXT: bltz a4, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: not a2, a2 +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: srl a2, a5, a2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: .LBB1_3: +; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_64: +; RV64: # %bb.0: +; RV64-NEXT: srl a0, a0, a1 +; RV64-NEXT: sll a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} + +define i128 @mask_pair_128(i128 %x, i128 %y) { +; RV32-LABEL: mask_pair_128: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a1, 12(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: li a6, -1 +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: sw a6, 16(sp) +; RV32-NEXT: sw a6, 20(sp) +; RV32-NEXT: sw a6, 24(sp) +; RV32-NEXT: sw a6, 28(sp) +; RV32-NEXT: srli a6, a2, 3 +; RV32-NEXT: andi a6, a6, 12 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: lw a7, 4(a6) +; RV32-NEXT: lw t0, 8(a6) +; RV32-NEXT: lw t1, 12(a6) +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: andi t2, a2, 31 +; RV32-NEXT: xori t2, t2, 31 +; RV32-NEXT: sll t1, t1, a2 +; RV32-NEXT: srli t3, t0, 1 +; RV32-NEXT: sll t0, t0, a2 +; RV32-NEXT: srli t4, a7, 1 +; RV32-NEXT: sll a7, a7, a2 +; RV32-NEXT: sll a2, a6, a2 +; RV32-NEXT: srli a6, a6, 1 +; RV32-NEXT: srl t3, t3, t2 +; RV32-NEXT: srl t4, t4, t2 +; RV32-NEXT: srl a6, a6, t2 +; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: or a5, t1, t3 +; RV32-NEXT: or t0, t0, t4 +; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: and a4, a6, a4 +; RV32-NEXT: and a3, t0, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_128: +; RV64: # %bb.0: +; RV64-NEXT: li a5, -1 +; RV64-NEXT: addi a4, a2, -64 +; RV64-NEXT: sll a3, a5, a2 +; RV64-NEXT: bltz a4, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: j .LBB2_3 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: srl a2, a5, 
a2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:  .LBB2_3:
+; RV64-NEXT:    srai a4, a4, 63
+; RV64-NEXT:    and a3, a4, a3
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    ret
+  %shl = shl nsw i128 -1, %y
+  %and = and i128 %shl, %x
+  ret i128 %and
+}

From 849038cad16f18d77b5cd277980c93e8efbf1bbc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 20:53:42 -0800
Subject: [PATCH 277/313] AMDGPU: Do not infer implicit inputs for !nocallback intrinsics (#131759)

This isn't really the right check; we want to know that the intrinsic
does not perform a true function call to any code (in the module or
not). nocallback appears to be the closest thing to this property we
have now, though.

Fixes theoretical miscompiles with intrinsics like statepoint, which
hide a call to a real function.

Also do the same for inferring no-agpr usage.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 27 ++++++-
 ...attributor-intrinsic-missing-nocallback.ll | 31 ++++++++
 ...amdgpu-attributor-nocallback-intrinsics.ll | 74 +++++++++++++++++++
 3 files changed, 128 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8669978637f40..56ab040706a13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -38,9 +38,10 @@ enum ImplicitArgumentPositions {
 #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
 enum ImplicitArgumentMask {
-  NOT_IMPLICIT_INPUT = 0,
+  UNKNOWN_INTRINSIC = 0,
 #include "AMDGPUAttributes.def"
-  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+  NOT_IMPLICIT_INPUT
 };
 #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
     NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
     return QUEUE_PTR;
   default:
-    return NOT_IMPLICIT_INPUT;
+    return UNKNOWN_INTRINSIC;
   }
 }
@@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       ImplicitArgumentMask AttrMask =
           intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                               HasApertureRegs, SupportsGetDoorbellID, COV);
+
+      if (AttrMask == UNKNOWN_INTRINSIC) {
+        // Assume not-nocallback intrinsics may invoke a function which accesses
+        // implicit arguments.
+        //
+        // FIXME: This isn't really the correct check. We want to ensure it
+        // isn't calling any function that may use implicit arguments regardless
+        // of whether it's internal to the module or not.
+        //
+        // TODO: Ignoring callsite attributes.
+        if (!Callee->hasFnAttribute(Attribute::NoCallback))
+          return indicatePessimisticFixpoint();
+        continue;
+      }
+
       if (AttrMask != NOT_IMPLICIT_INPUT) {
         if ((IsNonEntryFunc || !NonKernelOnly))
           removeAssumedBits(AttrMask);
@@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc
     default:
       // Some intrinsics may use AGPRs, but if we have a choice, we are not
       // required to use AGPRs.
-      return true;
+
+      // Assume !nocallback intrinsics may call a function which requires
+      // AGPRs.
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) 
@llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. From 044e0f041d5c02ae3b4e44bab4647008ff497d9f Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 21:11:03 -0800 Subject: [PATCH 278/313] Revert "IR: Remove null UseList checks in hasNUses methods (#165929)" (#166500) This reverts commit 93e860e694770f52a9eeecda88ba11173c291ef8. hasOneUse still has the null check, and it seems bad to be logically inconsistent across multiple of these predicate functions. 
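
As a rough illustration of the restored semantics, here is a minimal
standalone sketch -- SketchValue and its vector-backed use list are
hypothetical stand-ins for the real Value/Use machinery, not LLVM API.
A value kind that tracks no use list at all, as ConstantData does after
use-list removal, should answer these predicates as having exactly zero
uses rather than walking a null list:

  #include <cassert>
  #include <vector>

  struct SketchValue {
    // nullptr models a value kind that does not track uses at all.
    const std::vector<int> *UseList = nullptr;

    bool hasNUses(unsigned N) const {
      if (!UseList)
        return N == 0; // no use list: exactly zero uses
      return UseList->size() == N;
    }

    bool hasNUsesOrMore(unsigned N) const {
      if (!UseList)
        return N == 0; // zero uses: "at least N" holds only for N == 0
      return UseList->size() >= N;
    }
  };

  int main() {
    SketchValue ConstantLike; // use-list-free, like ConstantData
    assert(ConstantLike.hasNUses(0));
    assert(!ConstantLike.hasNUses(1));
    assert(!ConstantLike.hasNUsesOrMore(1));
  }

With the guard back, hasNUses and hasNUsesOrMore agree with hasOneUse on
use-list-free values, which is the consistency this revert restores.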
--- llvm/lib/IR/Value.cpp | 8 +++++++ llvm/unittests/IR/ConstantsTest.cpp | 36 ++++++++--------------------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 95d61a987f6c1..b775cbb0c7920 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -148,10 +148,18 @@ void Value::destroyValueName() { } bool Value::hasNUses(unsigned N) const { + if (!UseList) + return N == 0; + + // TODO: Disallow for ConstantData and remove !UseList check? return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { + // TODO: Disallow for ConstantData and remove !UseList check? + if (!UseList) + return N == 0; + return hasNItemsOrMore(use_begin(), use_end(), N); } diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp index 9cb9e1236b2d1..6376165cbe766 100644 --- a/llvm/unittests/IR/ConstantsTest.cpp +++ b/llvm/unittests/IR/ConstantsTest.cpp @@ -29,8 +29,13 @@ TEST(ConstantsTest, UseCounts) { EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); + EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); + EXPECT_FALSE(Zero->hasNUses(1)); + EXPECT_FALSE(Zero->hasNUsesOrMore(1)); + EXPECT_FALSE(Zero->hasNUses(2)); + EXPECT_FALSE(Zero->hasNUsesOrMore(2)); std::unique_ptr<Module> M(new Module("MyModule", Context)); @@ -45,36 +50,15 @@ TEST(ConstantsTest, UseCounts) { // Still looks like use_empty with uses. EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); + EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); + EXPECT_FALSE(Zero->hasNUses(1)); + EXPECT_FALSE(Zero->hasNUsesOrMore(1)); + EXPECT_FALSE(Zero->hasNUses(2)); + EXPECT_FALSE(Zero->hasNUsesOrMore(2)); } -#ifdef GTEST_HAS_DEATH_TEST -#ifndef NDEBUG - -TEST(ConstantsTest, hasNUsesInvalid) { - LLVMContext Context; - Type *Int32Ty = Type::getInt32Ty(Context); - Constant *Zero = ConstantInt::get(Int32Ty, 0); - std::unique_ptr<Module> M(new Module("MyModule", Context)); - - // Introduce some uses - new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, - GlobalValue::ExternalLinkage, /*Initializer=*/Zero, - "gv_user0"); - new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, - GlobalValue::ExternalLinkage, /*Initializer=*/Zero, - "gv_user1"); - - for (int I = 0; I != 3; ++I) { - EXPECT_DEATH(Zero->hasNUses(I), "hasUseList\\(\\)"); - EXPECT_DEATH(Zero->hasNUsesOrMore(I), "hasUseList\\(\\)"); - } -} - -#endif -#endif - TEST(ConstantsTest, Integer_i1) { LLVMContext Context; IntegerType *Int1 = IntegerType::get(Context, 1); From 87b1d3537ae6adcb0a16cc0fa2749862d01009d3 Mon Sep 17 00:00:00 2001 From: LU-JOHN <John.Lu@amd.com> Date: Tue, 4 Nov 2025 23:18:40 -0600 Subject: [PATCH 279/313] [AMDGPU][NFC] Avoid copying MachineOperands (#166293) Avoid copying machine operands. 
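
For illustration, the pattern the patch applies, sketched outside LLVM
(Operand and Inst here are hypothetical stand-ins, not the real
MachineOperand/MachineInstr): when an accessor already returns a
reference, binding the result to a const reference avoids materializing
a copy of the operand.

  #include <vector>

  struct Operand {
    std::vector<int> Payload; // copying an operand is not free
  };

  struct Inst {
    std::vector<Operand> Ops;
    const Operand &getOperand(unsigned I) const { return Ops[I]; }
  };

  void inspect(const Inst &I) {
    Operand ByValue = I.getOperand(0);      // copies Payload
    const Operand &ByRef = I.getOperand(0); // binds to the stored operand
    (void)ByValue;
    (void)ByRef;
  }

None of the changed call sites mutate the operand afterwards, so taking
a const reference is behavior-preserving and the change stays NFC.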
Signed-off-by: John Lu <John.Lu@amd.com> --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 +++--- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 957d7164b686e..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -6764,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6817,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. 
Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 45f591927b86e..9460145d47111 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..0643b532ea04c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index abefa32b8f802..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && From 4c2a9c4ba3796799032c12673510279d51c65370 Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Wed, 5 Nov 2025 05:47:35 +0000 Subject: [PATCH 280/313] [msan][test] Add some avx512bf16 tests (#166219) Forked from llvm/test/CodeGen/X86 --- .../X86/avx512bf16-intrinsics.ll | 355 ++++++++ .../MemorySanitizer/X86/avx512bf16-mov.ll | 123 +++ .../X86/avx512bf16-vl-intrinsics.ll | 774 ++++++++++++++++++ 3 files changed, 1252 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll create mode 100644 
llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x 
i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> 
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: 
+; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x 
i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 
[[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: 
[[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 
x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) 
local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 
+ %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> 
%A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = 
bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 
[[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x 
bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x 
bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
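The FileCheck patterns above all exercise the same MemorySanitizer recipe: parameter shadow is loaded from `__msan_param_tls`, fully-checked operands (here the float inputs) feed an `_MSCMP`/`_MSOR` chain that branches to `__msan_warning_noreturn`, shadow is propagated through the mask selects as `_MSPROP_SELECT`, and the result shadow is stored to `__msan_retval_tls`. As a minimal sketch (not part of the patch), this is the C source shape the masked-convert tests correspond to, assuming the standard AVX512-BF16 intrinsics from `<immintrin.h>`; the build line is illustrative, not taken from the test's RUN line:

    /* Sketch only: a masked convert that lowers to
     * llvm.x86.avx512bf16.cvtneps2bf16.256 followed by a mask select.
     * Build (illustrative): clang -O2 -mavx512bf16 -mavx512vl -fsanitize=memory */
    #include <immintrin.h>

    __m128bh mask_cvtneps_pbh(__m128bh C, __mmask8 U, __m256 A) {
      /* MSan checks A's shadow before the convert, then merges the shadows
       * of the convert result and of C through the mask select on U. */
      return _mm256_mask_cvtneps_pbh(C, U, A);
    }
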
From 19a9de06cd557da2289c07db0228ec8987d26dc4 Mon Sep 17 00:00:00 2001
From: yicuixi <63290259+yicuixi@users.noreply.github.com>
Date: Wed, 5 Nov 2025 14:43:40 +0800
Subject: [PATCH 281/313] [clang] Accept empty enum in MSVC compatible C
 (#159981)

Fixes https://github.com/llvm/llvm-project/issues/114402. This patch accepts
empty enums in C as a Microsoft extension and introduces a new warning
`-Wmicrosoft-empty-enum`.

---------

Signed-off-by: yicuixi <qin_17914@126.com>
Co-authored-by: Erich Keane <ekeane@nvidia.com>
Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/docs/LanguageExtensions.rst | 10 ++++++++++
 clang/docs/ReleaseNotes.rst | 1 +
 clang/include/clang/Basic/DiagnosticGroups.td | 4 +++-
 clang/include/clang/Basic/DiagnosticParseKinds.td | 3 +++
 clang/lib/Parse/ParseDecl.cpp | 9 +++++++--
 clang/test/CodeGen/ms-empty-enum.c | 7 +++++++
 clang/test/Parser/ms-empty-enum.c | 6 ++++++
 7 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/ms-empty-enum.c
 create mode 100644 clang/test/Parser/ms-empty-enum.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index baa0bbb5ea631..b7c18c2b5c6cf 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2409,6 +2409,16 @@ those modes.
 Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed
 underlying types is available in C23 and later.

+Enumerations with no enumerators
+--------------------------------
+
+Clang supports the Microsoft extension that allows enumerations with no enumerators.
+
+.. code-block:: c++
+
+  typedef enum empty { } A;
+
+
 Interoperability with C++11 lambdas
 -----------------------------------

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3f57ddc92d5e8..32f669f8d70d8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -444,6 +444,7 @@ Bug Fixes in This Version
 - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898)
 - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951)
 - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953)
+- Accept empty enumerations in MSVC-compatible C mode.
(#GH114402) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 8aa3489a2a62b..1e0321de3f4b6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">; def MicrosoftInclude : DiagGroup<"microsoft-include">; def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">; def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">; +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">; def MicrosoftSealed : DiagGroup<"microsoft-sealed">; def MicrosoftAbstract : DiagGroup<"microsoft-abstract">; def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">; @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft", MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag, MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined, - MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>; + MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction, + MicrosoftEmptyEnum]>; def ClangClPch : DiagGroup<"clang-cl-pch">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e5e071f43fa75..aa0ccb0c05101 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error< def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup<MicrosoftFixedEnum>; +def ext_ms_c_empty_enum_type : Extension< + "empty enumeration types are a Microsoft extension">, + InGroup<MicrosoftEmptyEnum>; def ext_c23_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a C23 extension">, InGroup<C23>; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7e4a164e34eda..5fcb659768655 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5370,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl, T.consumeOpen(); // C does not allow an empty enumerator-list, C++ does [dcl.enum]. 
- if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) - Diag(Tok, diag::err_empty_enum); + if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) { + if (getLangOpts().MicrosoftExt) + Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type) + << SourceRange(T.getOpenLocation(), Tok.getLocation()); + else + Diag(Tok, diag::err_empty_enum); + } SmallVector<Decl *, 32> EnumConstantDecls; SmallVector<SuppressAccessChecks, 32> EnumAvailabilityDiags; diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c new file mode 100644 index 0000000000000..6c1c87b756f9a --- /dev/null +++ b/clang/test/CodeGen/ms-empty-enum.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s + +typedef enum tag1 {} A; + +// CHECK: void @empty_enum(i32 noundef %a) +void empty_enum(A a) {} diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c new file mode 100644 index 0000000000000..790547af88bab --- /dev/null +++ b/clang/test/Parser/ms-empty-enum.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions + +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\ + // expected-warning {{empty enumeration types are a Microsoft extension}} From 304d2ff4d998abac779b11afeaede2146c2c60a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 22:56:19 -0800 Subject: [PATCH 282/313] CodeGen: Record MMOs in finalizeBundle (#166210) This allows more accurate alias analysis to apply at the bundle level. This has a bunch of minor effects in post-RA scheduling that look mostly beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic). The pre-existing (and unchanged) test in CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll tests that MIR with a bundle with MMOs can be parsed successfully. 
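To sketch the merge semantics first (with stand-in types, not LLVM's real MachineInstr/MachineMemOperand API): the BUNDLE header instruction ends up carrying the union of the memory operands of the instructions it bundles, so an alias-analysis query against the bundle sees every location the bundle may touch. The actual change is the `MemMIs`/`cloneMergedMemRefs` hunk in MachineInstrBundle.cpp below; this toy model is hypothetical and only illustrates the idea:

    /* Toy model of the finalizeBundle MMO merge; the struct is a stand-in. */
    #include <stdio.h>

    struct Instr {
      int MayLoadOrStore;  /* stands in for MachineInstr::mayLoadOrStore() */
      const char *MemRef;  /* at most one memory operand, for simplicity */
    };

    int main(void) {
      /* A bundle of three instructions; the middle one touches no memory. */
      struct Instr Bundle[] = {
          {1, "load (s32) from %stack.0"},
          {0, 0},
          {1, "store (s32) into @gv"},
      };
      /* Collect the memory-touching members and attach the union of their
       * memory references to the bundle header, as finalizeBundle now does. */
      printf("BUNDLE ::");
      for (unsigned I = 0; I < sizeof(Bundle) / sizeof(Bundle[0]); ++I)
        if (Bundle[I].MayLoadOrStore)
          printf(" (%s)", Bundle[I].MemRef);
      printf("\n");
      return 0;
    }
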
v2: - use cloneMergedMemRefs - add another test to explicitly check the MMO bundling behavior v3: - use poison instead of undef to initialize the global variable in the test --- llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2 + llvm/lib/CodeGen/MachineInstrBundle.cpp | 6 + .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/store-local.128.ll | 20 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 7 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 9954 ++++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 363 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1503 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 295 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 357 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 370 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 534 +- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 839 +- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 756 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 1361 ++- llvm/test/CodeGen/AMDGPU/bf16.ll | 28 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 49 +- .../CodeGen/AMDGPU/call-argument-types.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 4 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 63 +- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 52 + .../AMDGPU/gfx-callable-return-types.ll | 153 +- llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 21 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 156 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 18 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 10 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214 +- ...er-buffer-fat-pointers-lastuse-metadata.ll | 31 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 58 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 4 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 8 +- llvm/test/CodeGen/AMDGPU/max.ll | 2 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 2085 ++-- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 6 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 203 +- .../CodeGen/AMDGPU/postra-bundle-memops.mir | 5 +- .../postra-bundle-vimage-vsample-gfx12.mir | 4 +- .../AMDGPU/promote-constOffset-to-imm.ll | 41 +- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 2370 ++-- .../soft-clause-exceeds-register-budget.ll | 15 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 6 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 22 +- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +- .../Thumb2/mve-vpt-block-fold-vcmp.mir | 45 +- 53 files changed, 10762 insertions(+), 11345 deletions(-) diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 4795d81e3f348..434a579c3be3f 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; + if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) + break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp 
b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 88d81993fbe55..a8dc614288f20 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -137,6 +137,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; SmallVector<std::pair<Register, Register>> TiedOperands; + SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -200,6 +201,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + + if (MII->mayLoadOrStore()) + MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -225,6 +229,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(UseIdx < ExternUses.size()); MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } + + MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index c2129c20e4543..6076a2eec44bc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -33,7 +33,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -51,6 +50,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 1812e17800e71..10e83b70a57d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; 
GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s3, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index b33b8a7d8cd72..4a22a911c60b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 74552a500ac51..746ffcff5667a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, 
vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -3946,8 +3944,24 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -4295,44 +4309,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -4542,129 +4540,129 @@ 
define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5113,9 +5111,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5255,15 +5254,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -5280,12 +5272,23 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5302,9 +5305,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,7 +5437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5493,7 +5492,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5508,7 +5507,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5520,149 +5519,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5670,7 +5667,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5698,7 +5697,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6006,9 +6005,25 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -6280,22 +6295,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,7 +6754,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -6776,10 +6779,6 @@ define <128 x i8> 
@bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7416,7 +7415,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10666,7 +10665,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11599,7 +11598,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11812,13 +11811,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11979,44 +11991,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ 
-12025,11 +12023,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12632,7 +12630,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -12646,8 +12643,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13327,13 +13324,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt 
vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13470,34 +13479,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13983,7 +13978,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14561,13 +14555,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14709,34 +14717,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 
offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15223,7 +15217,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16362,7 +16355,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16395,7 +16388,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17336,7 +17329,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17369,7 +17362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, 
off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18086,24 +18079,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18114,10 +18096,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18127,7 +18121,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18722,13 +18715,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18956,11 +18949,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18970,11 +18963,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18982,6 +18972,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19190,12 +19182,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19213,6 +19199,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19222,7 +19214,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19235,7 +19227,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: 
s_and_b32 s4, s28, 0xff @@ -19820,8 +19811,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -20000,16 +19991,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20036,9 +20029,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20054,14 +20046,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20073,10 +20067,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20089,10 +20084,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20106,17 +20103,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], 
s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20132,45 +20134,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, 
v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20221,18 +20202,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20246,6 +20215,18 @@ define 
inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20683,7 +20664,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20716,7 +20697,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21573,7 +21554,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21606,7 +21587,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21624,7 +21605,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21657,7 +21638,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22514,7 +22495,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22547,7 +22528,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23110,10 +23091,25 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -23292,22 +23288,6 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26129,7 +26109,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26146,9 +26129,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -26714,7 +26694,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -29181,7 +29161,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -29214,7 +29194,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -29247,7 +29227,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -30049,7 +30029,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -30082,7 +30062,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, 
s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -30115,7 +30095,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -30155,7 +30135,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -30188,7 +30168,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -30221,7 +30201,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -30913,7 +30893,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -30946,7 +30926,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 
128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -30979,7 +30959,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -31788,6 +31768,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -31807,22 +31803,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32493,22 +32473,6 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32524,6 +32488,22 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -34732,7 +34712,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -34765,7 +34745,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> 
inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -34798,7 +34778,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -34876,7 +34856,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -34909,7 +34889,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -34942,7 +34922,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -35000,6 +34980,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -35016,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -35051,14 +35031,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -35103,7 +35082,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 @@ -35330,6 +35308,22 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -35356,7 +35350,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -35369,22 
+35363,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36338,7 +36316,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -36370,12 +36354,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36391,7 +36369,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -36608,7 +36585,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -37782,7 +37758,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -37815,7 +37791,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -37848,7 +37824,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -37926,7 +37902,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -37959,7 +37935,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: 
scratch_load_b32 v111, off, s32 offset:136 @@ -37992,7 +37968,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -40033,22 +40009,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -40181,6 +40141,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -40212,14 +40188,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -40451,7 +40426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -40874,8 +40848,24 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -41223,44 +41213,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -41365,6 +41323,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -41470,129 +41444,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -42041,9 +42015,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42183,15 +42158,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -42208,12 +42176,23 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32f32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -42230,9 +42209,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -42365,7 +42341,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -42421,7 +42396,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -42436,7 +42411,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -42448,149 +42423,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42598,7 +42571,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42626,7 +42601,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: 
s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -42934,9 +42909,25 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -43208,22 +43199,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -43666,7 +43641,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43687,10 +43666,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44310,7 +44285,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44770,27 +44745,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 
v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44842,24 +44801,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44868,6 +44836,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44875,6 +44845,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44885,6 +44856,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45408,33 +45381,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 +; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s46 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s58 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45468,6 +45441,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, 
; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45694,9 +45668,22 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload @@ -45809,17 +45796,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46070,19 +46056,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -46687,6 +46660,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46764,6 +46741,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46841,6 +46821,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46859,40 +46842,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -46946,11 +46895,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -47018,6 +46991,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -47050,7 +47037,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 @@ -47341,20 +47328,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload @@ -48123,10 +48096,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 -; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48175,6 +48146,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48222,6 +48194,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48291,6 +48264,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -48327,7 +48314,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 @@ -48621,20 +48608,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload @@ -48646,7 +48619,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: 
scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48681,7 +48654,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49601,7 +49574,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49663,7 +49636,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49876,13 +49849,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50043,44 +50029,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50089,11 +50061,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50696,7 +50668,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50710,8 +50681,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51391,13 +51362,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51534,34 +51517,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: 
s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52047,7 +52016,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52625,13 +52593,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort 
v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52773,34 +52755,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, 
off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53287,7 +53255,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54426,7 +54393,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54459,7 +54426,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55400,7 +55367,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55433,7 +55400,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56150,24 +56117,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56178,10 +56134,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56191,7 +56159,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; 
%cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56786,13 +56753,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -57020,11 +56987,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57034,11 +57001,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57046,6 +57010,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57254,12 +57220,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57277,6 +57237,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -57286,7 +57252,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57299,7 +57265,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57884,8 +57849,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58064,16 +58029,18 @@ define inreg <32 
x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58100,9 +58067,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58118,14 +58084,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58137,10 +58105,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58153,10 +58122,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58170,17 +58141,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58196,45 +58172,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58285,18 +58240,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58310,6 +58253,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58747,7 +58702,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58780,7 +58735,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59637,7 +59592,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59670,7 +59625,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59688,7 +59643,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v42, s32 offset:468 @@ -59721,7 +59676,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -60578,7 +60533,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -60611,7 +60566,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -61174,10 +61129,25 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -61356,22 +61326,6 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62140,6 +62094,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -62176,10 +62144,9 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; 
SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -62193,20 +62160,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -64239,7 +64192,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -64256,9 +64212,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -64824,7 +64777,7 @@ define <32 x float> 
@bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -67291,7 +67244,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67324,7 +67277,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67357,7 +67310,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68159,7 +68112,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68192,7 +68145,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68225,7 +68178,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, 
s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68265,7 +68218,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68298,7 +68251,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68331,7 +68284,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -69023,7 +68976,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69056,7 +69009,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69089,7 +69042,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; 
GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -69898,6 +69851,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -69917,22 +69886,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -70603,11 +70556,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -70624,6 +70572,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -72813,7 +72766,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72846,7 +72799,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72879,7 +72832,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72957,7 +72910,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: 
scratch_load_b32 v175, off, s32 offset:8 @@ -72990,7 +72943,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -73023,7 +72976,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73081,6 +73034,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73097,10 +73054,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73132,14 +73085,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73184,7 +73136,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73411,6 +73362,22 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { 
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -73437,7 +73404,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -73450,22 +73417,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -73951,9 +73902,25 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -73979,22 +73946,6 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -74373,7 +74324,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74405,12 +74362,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74426,7 +74377,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74643,7 +74593,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75817,7 +75766,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: 
scratch_store_b32 off, v42, s32 offset:284 @@ -75850,7 +75799,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75883,7 +75832,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75961,7 +75910,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75994,7 +75943,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -76027,7 +75976,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77054,22 +77003,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77202,6 +77135,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77233,14 +77182,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77501,7 +77449,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 
0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -77895,8 +77842,24 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload @@ -78244,44 +78207,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78386,6 +78317,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78491,129 +78438,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded 
Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79062,9 +79009,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79204,15 +79152,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -79229,12 +79170,23 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; 
GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79251,9 +79203,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79386,7 +79335,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79442,7 +79390,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79457,7 +79405,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79469,149 +79417,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte 
Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, 
v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79619,7 +79565,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79676,7 +79624,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -79955,9 +79903,25 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -80229,22 +80193,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -80712,7 +80660,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80733,10 +80685,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81381,7 +81329,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84631,7 +84579,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85566,7 +85514,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85779,13 +85727,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85946,44 +85907,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85992,11 +85939,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 
x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86599,7 +86546,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86613,8 +86559,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87294,13 +87240,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87437,34 +87395,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87950,7 +87894,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88528,13 +88471,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: 
buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88676,34 +88633,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 
offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89190,7 +89133,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90329,7 +90271,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90362,7 +90304,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91303,7 +91245,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91336,7 +91278,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, 
off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -92053,24 +91995,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92081,10 +92012,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92094,7 +92037,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92689,13 +92631,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92923,11 +92865,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92937,11 +92879,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92949,6 +92888,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93157,12 +93098,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93180,6 +93115,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93189,7 +93130,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93202,7 +93143,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: 
s_and_b32 s4, s28, 0xff @@ -93787,8 +93727,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93967,16 +93907,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -94003,9 +93945,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -94021,14 +93962,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -94040,10 +93983,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94056,10 +94000,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94073,17 +94019,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], 
s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94099,45 +94050,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, 
v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94188,18 +94118,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94213,6 +94131,18 @@ define 
inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94650,7 +94580,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94683,7 +94613,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95540,7 +95470,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95573,7 +95503,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95591,7 +95521,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95624,7 +95554,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96481,7 +96411,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96514,7 +96444,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97078,10 +97008,25 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -97260,22 +97205,6 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -100084,7 +100013,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -100101,9 +100033,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 
offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -100669,7 +100598,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -103136,7 +103065,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103169,7 +103098,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103202,7 +103131,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -104004,7 +103933,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -104037,7 +103966,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -104070,7 +103999,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104110,7 +104039,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104143,7 +104072,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104176,7 +104105,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104868,7 +104797,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104901,7 +104830,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 
0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104934,7 +104863,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -105740,6 +105669,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -105759,22 +105704,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -106453,22 +106382,6 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -106484,6 +106397,22 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -108700,7 +108629,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108733,7 +108662,7 @@ 
define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -108766,7 +108695,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108844,7 +108773,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108877,7 +108806,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108910,7 +108839,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108968,6 +108897,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108984,10 +108917,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte 
Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -109019,14 +108948,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109099,7 +109027,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109296,6 +109223,22 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -109322,7 +109265,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) 
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -109335,22 +109278,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -110320,7 +110247,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110352,12 +110285,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] @@ -110373,7 +110300,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110590,7 +110516,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111764,7 +111689,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111797,7 +111722,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111830,7 +111755,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111908,7 +111833,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111941,7 +111866,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 
128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111974,7 +111899,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -112032,22 +111957,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112180,6 +112089,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112211,14 +112136,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112449,7 +112373,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -112827,8 +112750,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload @@ 
-113206,44 +113145,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113346,6 +113253,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113448,132 +113371,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; 
VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] -; VI-NEXT: 
buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -114009,9 +113932,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -114151,17 +114075,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; 
VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -114178,12 +114092,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114200,9 +114128,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114335,7 +114260,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 
-; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114395,7 +114319,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114408,7 +114332,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(49) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114416,152 +114340,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114571,6 +114494,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114599,7 +114523,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -114904,8 +114828,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115170,22 +115110,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -115628,7 +115552,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115649,10 +115577,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116272,7 +116196,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -117056,6 +116980,11 @@ define inreg <128 x 
i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117148,6 +117077,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117204,20 +117137,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, 
s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117260,165 +117242,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 
offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: 
v_readlane_b32 s5, v61, 27 -; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -117711,9 +117636,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -118002,15 +117927,6 @@ define 
inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -118024,6 +117940,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -118690,6 +118615,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118727,6 +118656,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118764,6 +118696,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118779,52 +118714,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: 
v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118834,6 +118723,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118846,17 +118736,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118886,13 +118776,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119291,10 +119216,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -119309,6 +119231,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -119906,6 +119831,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119982,6 +119913,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 @@ -120040,71 +119975,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 
v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -120119,6 +119993,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120143,7 +120021,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120181,6 +120058,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120202,6 +120127,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120252,46 +120179,45 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120305,9 +120231,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 @@ -120319,9 +120247,23 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 @@ -120343,10 +120285,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120574,20 +120513,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -120599,7 +120524,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120634,7 +120559,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121542,7 +121467,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121605,7 +121530,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121818,13 +121743,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121985,44 +121923,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; 
SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -122031,11 +121955,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122638,7 +122562,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122652,8 +122575,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123333,13 +123256,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123476,34 +123411,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123989,7 +123910,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -124567,13 +124487,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124715,34 +124649,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125229,7 +125149,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126368,7 +126287,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126401,7 +126320,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127342,7 +127261,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127375,7 +127294,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128092,24 +128011,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128120,10 +128028,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128133,7 +128053,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128728,13 +128647,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128962,11 +128881,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128976,11 +128895,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128988,6 +128904,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129196,12 +129114,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129219,6 +129131,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129228,7 +129146,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129241,7 +129159,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129826,8 +129743,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -130006,16 +129923,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -130042,9 +129961,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload 
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -130060,14 +129978,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -130079,10 +129999,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130095,10 +130016,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130112,17 +130035,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130138,45 +130066,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130227,18 +130134,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -130252,6 +130147,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130689,7 +130596,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130722,7 +130629,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131579,7 +131486,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131612,7 +131519,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131630,7 +131537,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131663,7 +131570,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132520,7 +132427,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132553,7 +132460,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132588,22 +132495,6 @@ define <64 x bfloat> 
@bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132672,6 +132563,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132703,7 +132610,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; 
implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132713,7 +132620,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132843,7 +132750,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -133081,10 +132987,25 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -133263,22 +133184,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -133966,8 +133871,22 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -134055,20 +133974,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -136071,7 +135976,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -136088,9 +135996,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136656,7 +136561,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -139123,7 +139028,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139156,7 +139061,7 @@ define inreg <16 x double> 
@bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139189,7 +139094,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139991,7 +139896,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -140024,7 +139929,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -140057,7 +139962,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140097,7 +140002,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140130,7 +140035,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140163,7 +140068,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140855,7 +140760,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140888,7 +140793,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140921,7 +140826,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140978,22 +140883,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -141062,6 +140951,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -141093,7 +140998,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141144,7 +141049,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141314,7 +141218,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 
v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -141662,8 +141565,24 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -141712,22 +141631,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -142372,6 +142275,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -142391,22 +142310,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: 
.LBB81_4: @@ -144567,7 +144470,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144600,7 +144503,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144633,7 +144536,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144711,7 +144614,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144744,7 +144647,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144777,7 +144680,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144835,6 +144738,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; 
implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144851,10 +144758,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144886,14 +144789,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144937,7 +144839,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145149,6 +145050,22 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -145175,7 +145092,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -145188,22 +145105,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -145607,7 +145508,23 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 @@ -145668,22 +145585,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: @@ -146031,7 +145932,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -146063,12 +145970,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -146084,7 +145985,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146301,7 +146201,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147475,7 +147374,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147508,7 +147407,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147541,7 +147440,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147619,7 +147518,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: 
scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147652,7 +147551,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147685,7 +147584,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147895,6 +147794,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147904,7 +147805,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147944,38 +147845,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147991,11 +147893,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -148017,14 +147920,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -148032,11 +147927,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -148045,9 +147944,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -148057,7 +147958,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149557,10 +149458,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -149857,22 +149773,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -149940,8 +149840,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -150037,13 +149937,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150171,14 +150083,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150186,26 +150103,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150214,35 +150111,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: 
$vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150275,39 +150194,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; 
implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150473,17 +150372,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151168,8 +151059,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151280,13 +151171,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; 
GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151419,14 +151324,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151434,26 +151344,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151462,36 +151352,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151514,49 +151430,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 
4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151722,17 +151614,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -153078,7 +152962,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -153111,7 +152995,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153940,7 +153824,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153973,7 +153857,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -154018,7 +153902,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -154029,7 +153912,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: v_writelane_b32 v43, s18, 1 ; SI-NEXT: v_writelane_b32 v43, s17, 2 @@ -154070,10 +153953,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s38, v20 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s39, v19 @@ -154100,9 +153989,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -154110,33 +153997,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 5 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154219,44 +154104,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s24, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: 
v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154270,7 +154152,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -154279,7 +154161,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_writelane_b32 v43, s22, 34 @@ -155894,33 +155775,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; 
VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155965,52 +155866,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, 
off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -156030,6 +155885,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -156038,7 +155894,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -156070,6 +155925,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -156094,15 +155968,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156152,10 +156029,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; 
VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156163,50 +156041,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) 
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156221,13 +156086,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156249,21 +156113,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload 
-; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156281,11 +156152,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156318,7 +156188,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156339,6 +156208,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156930,29 +156800,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: 
buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -157016,82 +156908,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -157112,6 +156964,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157365,14 +157224,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157382,7 +157240,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157394,6 +157251,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157859,7 +157717,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158589,7 +158447,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158631,7 +158489,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159415,7 +159273,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161437,7 +161295,23 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload @@ -161484,28 +161358,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -161522,9 +161383,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161713,166 +161571,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; 
VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 ; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161885,6 +161742,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162518,27 +162376,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 
4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -162923,9 +162781,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -163020,16 +162879,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; 
VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -163046,6 +162895,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -163282,49 +163141,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163338,6 +163159,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163355,130 +163177,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163571,16 +163431,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: 
v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163735,7 +163590,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: s_waitcnt vmcnt(50) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163750,7 +163605,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163891,8 +163745,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163958,6 +163814,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163965,7 +163822,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163994,24 +163851,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -164031,12 +163878,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -164045,6 +163899,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -164052,12 +163924,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v17, 16, v57 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -164101,7 +163969,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164134,74 +164001,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164214,15 +164058,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164231,6 +164086,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164255,31 +164113,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164294,6 +164154,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164302,6 +164166,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164310,38 +164176,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164516,13 +164373,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 @@ -164533,13 +164390,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 @@ -164679,7 +164536,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164712,7 +164573,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164741,10 +164602,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165778,7 +165635,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 
; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165811,7 +165668,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165846,7 +165703,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165869,10 +165730,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166991,7 +166848,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -168663,13 +168520,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s86, 24 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -168708,20 +168578,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -169924,6 +169780,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: v_mov_b32_e32 v42, s54 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 ; VI-NEXT: v_readlane_b32 s4, v62, 14 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 15 @@ -169949,6 +169814,11 @@ define inreg <128 x i8> 
@bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 22 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 ; VI-NEXT: v_readlane_b32 s4, v62, 23 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169994,6 +169864,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -170052,45 +169924,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: v_mov_b32_e32 v36, s66 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s58 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v55, s88 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v41, s58 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 ; VI-NEXT: v_mov_b32_e32 v59, s87 +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v55, v50 ; VI-NEXT: v_mov_b32_e32 v58, s34 ; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 ; VI-NEXT: v_mov_b32_e32 v3, s42 @@ -170123,37 +169997,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v30, s29 ; VI-NEXT: v_mov_b32_e32 v32, s5 ; VI-NEXT: v_mov_b32_e32 v41, s62 +; VI-NEXT: v_mov_b32_e32 v51, v53 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 ; VI-NEXT: v_mov_b32_e32 v57, s81 ; VI-NEXT: v_mov_b32_e32 v37, s84 +; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v60, s52 ; VI-NEXT: v_mov_b32_e32 v38, s51 ; VI-NEXT: v_mov_b32_e32 v61, s65 ; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v45, s53 +; VI-NEXT: v_mov_b32_e32 v39, s55 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end @@ -170462,9 +170318,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170542,9 +170399,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170562,20 +170433,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -172164,11 +172021,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -172183,6 +172036,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -172194,7 +172051,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -173744,7 +173601,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -173757,7 +173614,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -175314,7 +175171,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -175488,9 +175345,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 @@ -175508,6 +175362,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -175525,15 +175382,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, 
i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 @@ -175669,34 +175526,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, 
off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 @@ -175716,7 +175576,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 @@ -175726,7 +175589,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 @@ -175752,14 +175617,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -175882,7 +175739,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -176540,25 +176396,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt 
vmcnt(4) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -177265,9 +177114,24 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -177574,22 +177438,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177657,8 +177505,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -177754,13 +177602,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -177888,14 +177748,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt 
vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177903,26 +177768,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -177931,35 +177776,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177992,39 +177859,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178190,17 +178037,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -178885,8 +178724,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -178997,13 +178836,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -179136,14 +178989,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179151,26 +179009,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -179179,36 +179017,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179231,49 +179095,25 @@ define <64 x half> 
@bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; 
GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179439,17 +179279,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -180795,7 +180627,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -180828,7 +180660,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -181657,7 +181489,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -181690,7 +181522,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; 
GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -183232,17 +183064,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload @@ -183256,6 +183077,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload @@ -183515,33 +183347,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: 
buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -183586,52 +183438,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 
4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -183651,6 +183457,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -183659,7 +183466,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183691,6 +183497,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183715,15 +183540,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183773,10 +183601,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183784,50 +183613,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183842,13 +183658,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183870,21 +183685,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183902,11 +183724,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183939,7 +183760,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183960,6 +183780,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ 
-184551,29 +184372,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184637,82 +184480,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 
offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) 
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184733,6 +184536,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184986,14 +184796,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 
4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -185003,7 +184812,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -185015,6 +184823,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185480,7 +185289,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186210,7 +186019,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186252,7 +186061,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -187036,7 +186845,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -189053,13 +188862,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -189076,6 +188878,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -189098,27 +188907,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -189130,47 +188954,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 
16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -189179,38 +188994,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0 @@ -189254,8 +189037,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189293,28 +189102,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189326,83 +189156,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v62, v36 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189417,61 +189184,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_mov_b32_e32 v38, v56 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: v_lshrrev_b64 
v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_mov_b32_e32 v40, v16 -; VI-NEXT: v_mov_b32_e32 v16, v48 -; VI-NEXT: v_mov_b32_e32 v48, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v38, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: 
v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189490,36 +189290,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189527,13 +189338,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189542,38 +189361,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v49, v9, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189591,11 +189406,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189612,28 +189427,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189641,36 +189460,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; 
VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189679,7 +189475,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189695,8 +189490,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189714,21 +189507,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189736,23 +189529,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 
4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189760,27 +189544,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: buffer_load_dword v11, 
off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189797,26 +189593,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189866,27 +189660,25 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 
v1, 8, v24 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189898,9 +189690,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 @@ -189953,7 +189749,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189962,14 +189760,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 @@ -189991,11 +189786,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -190003,7 +189795,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -190053,7 +189847,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 @@ -190067,12 +189863,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -190088,35 +189881,35 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -190135,13 +189928,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v1, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -190161,12 +189951,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190176,15 +189964,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -190201,28 +189981,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190285,6 +190057,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190315,7 +190104,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190349,7 +190137,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190472,101 +190260,100 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(62) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded 
Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190582,6 +190369,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190607,7 +190395,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -191148,17 +190936,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -191175,6 +190953,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -191633,7 +191423,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191654,10 +191448,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192293,7 +192083,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off 
offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -193940,13 +193730,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s51, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 @@ -193979,21 +193783,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s45, v62, 17 ; SI-NEXT: v_readlane_b32 s43, v62, 23 ; SI-NEXT: v_readlane_b32 s41, v62, 29 @@ -194001,6 +193790,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s27, v62, 41 ; SI-NEXT: v_readlane_b32 s25, v62, 45 ; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -194483,8 +194273,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194492,6 +194280,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194499,12 +194288,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194512,14 +194295,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194554,6 +194343,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -195163,9 +194953,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -195174,13 +194966,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195312,9 +195101,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195382,20 +195184,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -195713,42 +195501,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196400,9 +196188,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -196434,10 +196224,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -196688,8 +196475,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -196704,6 +196489,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -196715,7 +196502,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196750,7 +196537,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; 
GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197669,7 +197456,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197731,7 +197518,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197782,11 +197569,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197876,7 +197663,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197884,28 +197694,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197913,240 +197716,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 
; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -198158,15 +197932,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -198202,7 +197980,7 @@ define 
<64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198682,15 +198460,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -199918,14 +199696,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -199942,6 +199714,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -200009,8 +199787,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -200106,13 +199884,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200240,14 +200030,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200255,26 +200050,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200283,35 +200058,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200344,39 +200141,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200542,17 +200319,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201237,8 +201006,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201349,13 +201118,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201488,14 +201271,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201503,26 +201291,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: 
buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201531,36 +201299,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201583,49 +201377,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201791,17 +201561,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -203147,7 +202909,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -203180,7 +202942,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -204009,7 +203771,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -204042,7 +203804,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -204087,7 +203849,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -204097,9 +203858,9 @@ 
define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -204148,6 +203909,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -204170,9 +203937,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -204181,30 +203946,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s6, v38 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -204218,6 +203981,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 
s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204294,39 +204061,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204340,7 +204103,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204367,9 +204130,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205938,33 +205698,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: 
v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -206009,52 +205789,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -206074,6 +205808,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -206082,7 +205817,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -206114,6 +205848,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -206138,15 +205891,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -206196,10 +205952,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -206207,50 +205964,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206265,13 +206009,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206293,21 +206036,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206325,11 +206075,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206362,7 +206111,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -206383,6 +206131,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206974,29 +206723,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -207060,82 +206831,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; 
GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -207156,6 +206887,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -207409,14 +207147,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207426,7 +207163,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207438,6 +207174,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207903,7 +207640,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v42, s32 offset:432 @@ -208633,7 +208370,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -208675,7 +208412,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209459,7 +209196,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209562,100 +209299,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209785,14 +209428,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209809,13 +209467,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209870,12 +209521,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209885,36 +209563,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], 
s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209936,6 +209659,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211507,9 +211242,25 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -211533,44 +211284,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211588,6 +211307,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211884,14 +211619,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211923,10 +211656,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211997,10 +211726,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v9, off, 
s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -212201,9 +211936,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -212211,38 +211943,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] @@ -212255,8 +211984,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 @@ -212325,6 +212052,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v49, v53 ; VI-NEXT: v_mov_b32_e32 v53, v38 ; VI-NEXT: v_mov_b32_e32 v38, v55 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill @@ -212336,6 +212064,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -212743,9 +212478,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -212768,44 +212518,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64i16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -212868,6 +212586,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -212898,7 +212633,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -212932,7 +212666,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -213055,101 +212789,100 @@ define <128 x i8> 
@bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
 ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +212898,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +212923,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
 ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
 ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
@@ -213730,17 +213464,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -213757,6 +213481,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -214215,7 +213951,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +213976,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +214611,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +214750,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s91, v32
 ; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s35, v37
 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(12)
 ; SI-NEXT: v_readfirstlane_b32 s83, v38
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +214782,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s39, v1
 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
 ; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
 ; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
 ; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s92, v36
 ; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s94, v37
 ; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
 ; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
 ; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
 ; SI-NEXT: v_writelane_b32 v41, s34, 15
 ; SI-NEXT: v_writelane_b32 v41, s36, 16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
 ; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: v_writelane_b32 v41, s48, 18
 ; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +217791,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -218753,9 +218484,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
@@ -218787,10 +218520,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -219041,8 +218771,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24
 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -219057,6 +218785,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -219068,7 +218798,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +218833,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5
 ; GFX11-NEXT: s_mov_b32 s99, 0
 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +219752,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v74, off, s32
 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +219814,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221102,9 +220832,24 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -221449,28 +221194,14 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v64bf16_to_v64f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +221218,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +221467,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +221832,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +221851,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222070,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +222370,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +222930,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -224930,10 +224659,26 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -224966,22 +224711,6 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -228962,7 +228691,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -228979,6 +228707,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -230321,20 +230050,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37
 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
@@ -230351,6 +230067,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -231398,17 +231127,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
 ; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
 ; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
@@ -231418,57 +231162,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
@@ -231526,31 +231276,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT: .LBB104_4: ; %end
 ; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
@@ -231808,7 +231535,23 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
@@ -231833,28 +231576,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v64bf16_to_v64i16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -231871,9 +231600,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232122,7 +231849,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -232488,6 +232214,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -232504,9 +232233,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232725,7 +232452,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -234330,15 +234057,21 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v57, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT: v_mov_b32_e32 v57, v13
 ; SI-NEXT: v_mov_b32_e32 v40, v3
 ; SI-NEXT: v_mov_b32_e32 v54, v50
 ; SI-NEXT: v_mov_b32_e32 v46, v19
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
 ; SI-NEXT: v_mov_b32_e32 v44, v15
 ; SI-NEXT: v_mov_b32_e32 v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
 ; SI-NEXT: s_mov_b64 s[4:5], 0
 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
@@ -234372,32 +234105,24 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_mov_b32_e32 v42, v43
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT: v_mov_b32_e32 v5, v19
+; SI-NEXT: v_mov_b32_e32 v7, v15
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT: v_mov_b32_e32 v5, v19
-; SI-NEXT: v_mov_b32_e32 v7, v15
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
@@ -234533,9 +234258,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v56, v47
 ; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT: v_mov_b32_e32 v53, v5
 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
@@ -234543,6 +234266,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v47
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT: v_mov_b32_e32 v40, v3
 ; SI-NEXT: v_mov_b32_e32 v44, v15
 ; SI-NEXT: v_mov_b32_e32 v57, v13
@@ -234850,16 +234575,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
 ; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
 ; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16
@@ -234917,19 +234644,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16
 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16
 ; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -234955,10 +234679,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
@@ -234985,12 +234707,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -235034,7 +234755,25 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32
offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -235049,10 +234788,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -235067,10 +234806,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -235078,17 +234815,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -235096,17 +234834,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -235185,22 +234924,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -238415,7 +238138,23 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 @@ -238580,22 +238319,6 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -240180,38 +239903,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -240222,7 +239946,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -240804,16 +240527,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -240830,6 +240543,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: 
s_setpc_b64 s[30:31] ; @@ -241300,10 +241023,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241315,7 +241040,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241325,8 +241067,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241335,39 +241082,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 
; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241385,8 +241115,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241430,11 +241158,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241571,27 +241294,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 @@ -241685,17 +241408,18 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -241727,7 +241451,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -241742,10 +241468,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -241758,25 +241484,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt 
vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 @@ -241805,9 +241530,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -241825,22 +241566,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -242878,9 +242603,24 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -243225,22 +242965,6 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -244323,15 +244047,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -244348,6 +244065,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 9041f64cb17fb..e688681c5ad09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -15670,8 +15670,25 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15681,7 +15698,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15711,22 +15727,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15947,16 +15947,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt 
vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -21934,6 +21932,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 
@@ -22052,14 +22058,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 
4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ 
-28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28508,7 +28496,7 @@ define <5 x double> 
@bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { 
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28822,17 +28808,17 @@ define <5 x double> 
@bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32829,15 +32809,6 @@ define <5 x i64> 
@bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ee23420c2a662..39da45b3e5063 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8424,6 +8424,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -8459,22 +8475,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8757,6 +8757,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -8790,22 +8806,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9077,9 +9077,25 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9106,22 +9122,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11440,11 +11440,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11453,6 +11448,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,7 +11484,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11723,7 +11722,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11972,11 +11970,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12016,16 +12014,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: 
buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12035,6 +12026,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12044,11 +12042,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12211,7 +12208,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12221,7 +12218,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12428,11 +12424,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, 
s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12476,16 +12472,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12495,6 +12484,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12504,11 +12500,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12671,7 +12666,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12681,7 +12676,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17323,13 +17317,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17352,9 +17346,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23315,6 +23309,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23350,22 +23360,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23648,6 +23642,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -23681,22 +23691,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23968,9 +23962,25 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23997,22 +24007,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -25440,6 +25434,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25460,7 +25469,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25490,21 +25499,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 
4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -25873,6 +25867,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -25889,7 +25898,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25915,21 +25924,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -26452,11 +26446,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26465,6 +26454,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26496,7 +26490,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26735,7 +26728,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26984,11 +26976,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; 
VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27028,16 +27020,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27047,6 +27032,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27056,11 +27048,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27223,7 +27214,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; 
VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27233,7 +27224,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27440,11 +27430,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27488,16 +27478,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27507,6 +27490,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27516,11 +27506,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27683,7 +27672,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27693,7 +27682,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31688,13 +31676,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31717,9 +31705,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; 
SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37714,6 +37702,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -37749,22 +37753,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38047,6 +38035,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -38080,22 +38084,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38367,9 +38355,25 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38396,22 +38400,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -40740,11 +40728,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40753,6 +40736,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40784,7 +40772,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41023,7 +41010,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41272,11 +41258,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 
8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41316,16 +41302,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41335,6 +41314,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41344,11 +41330,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41511,7 +41496,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: 
v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41521,7 +41506,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41728,11 +41712,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41776,16 +41760,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41795,6 +41772,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] @@ -41804,11 +41788,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41971,7 +41954,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41981,7 +41964,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -45317,13 +45299,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45346,9 +45328,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 
16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -51165,6 +51147,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -51200,22 +51198,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51490,6 +51472,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -51523,22 +51521,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -51802,9 +51784,25 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51831,22 +51829,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -54188,11 +54170,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54201,6 +54178,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54232,7 +54214,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54471,7 +54452,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54720,11 +54700,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 
v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54764,16 +54744,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54783,6 +54756,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54792,11 +54772,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54959,7 +54938,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; 
VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54969,7 +54948,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55176,11 +55154,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55224,16 +55202,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55243,6 +55214,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ 
-55252,11 +55230,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55419,7 +55396,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55429,7 +55406,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60580,6 +60556,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60596,8 +60574,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60661,9 +60637,8 @@ define <32 x i16> 
@bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -64326,18 +64301,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -64354,6 +64317,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -64471,44 +64446,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: buffer_store_dword 
v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -64805,8 +64780,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -64823,6 +64796,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
s_setpc_b64 s[30:31] ; @@ -65094,9 +65069,25 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -65123,22 +65114,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -67243,6 +67218,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -67259,7 +67249,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67285,21 +67275,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -67768,17 +67743,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67793,25 +67812,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67819,7 +67837,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67833,57 +67850,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67892,7 +67860,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68173,7 +68140,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68198,7 +68164,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68222,7 +68187,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68430,8 +68394,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; 
%end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68448,6 +68410,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68458,7 +68422,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70196,13 +70159,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70219,6 +70181,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70242,19 +70205,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; 
SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70280,7 +70243,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70725,12 +70688,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70747,6 +70704,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70758,11 +70721,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, 
v36 @@ -70770,7 +70735,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72188,6 +72152,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72204,8 +72170,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72273,9 +72237,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -76994,8 +76957,24 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -77023,22 +77002,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -77351,7 +77314,23 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77375,22 +77354,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -77663,9 +77626,25 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77692,22 +77671,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -79163,13 +79126,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v52, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s44 +; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79215,6 +79177,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: 
v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v42, s76 ; VI-NEXT: v_mov_b32_e32 v55, s74 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, s57 ; VI-NEXT: v_mov_b32_e32 v41, s59 ; VI-NEXT: v_mov_b32_e32 v44, s60 @@ -79320,6 +79283,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79340,7 +79318,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79372,21 +79350,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload 
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -79756,6 +79719,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -79772,7 +79750,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79798,21 +79776,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -80286,6 +80249,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 @@ -80360,19 +80331,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 @@ -80390,7 +80352,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v16, v19, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 @@ -80403,7 +80365,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 @@ -80428,6 +80389,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 @@ -80634,13 +80596,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -80648,12 +80609,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v35, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 @@ -80852,13 +80811,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -80875,14 +80827,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v8, v33 ; SI-NEXT: v_mov_b32_e32 v10, v37 ; SI-NEXT: v_mov_b32_e32 v12, v49 ; SI-NEXT: v_mov_b32_e32 v14, v53 ; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v22, v38 ; SI-NEXT: v_mov_b32_e32 v24, v48 ; SI-NEXT: v_mov_b32_e32 v26, v50 @@ -84461,22 +84420,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 @@ -84542,6 +84485,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 @@ -84605,11 +84564,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -85220,8 +85177,24 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -85249,22 +85222,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -85820,6 +85777,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -85853,22 +85826,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -86400,21 +86357,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -86431,6 +86377,17 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -88045,10 +88002,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_lshl_b32 s4, s76, 8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s74, 0xff ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -88076,22 +88049,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: @@ -88805,6 +88762,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s67, v63, 19 @@ -88827,7 +88799,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88857,21 +88829,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -90429,6 +90386,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90458,28 +90417,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90496,8 +90457,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90513,16 +90472,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90803,7 +90754,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90829,7 +90779,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90841,7 +90790,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90854,7 +90802,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90868,7 +90815,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ 
-91086,11 +91032,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91109,6 +91052,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5d4df4bde1af8..46911e7934429 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4938,6 +4938,13 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -5037,13 +5044,6 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], 
s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12071,6 +12069,13 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -12170,13 +12175,6 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -18420,6 +18416,13 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -18519,13 +18522,6 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; 
SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23978,6 +23972,13 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -24077,13 +24078,6 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -27147,8 +27138,24 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -27203,22 +27210,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27760,6 +27751,17 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -27842,17 +27844,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -28709,6 +28700,12 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 @@ -28810,12 +28807,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29421,9 +29412,15 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29493,12 +29490,6 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 44cfd6c28ca6a..6749daba296c5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -5557,10 +5554,23 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -5656,19 +5666,6 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13756,10 +13750,23 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 
20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -13855,19 +13862,6 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14435,6 +14429,10 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -14573,9 +14571,6 @@ define inreg <40 x half> 
@bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -19249,6 +19244,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19270,17 +19276,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19302,13 +19297,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -21265,10 +21257,23 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -21364,19 +21369,6 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25988,6 +25980,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26009,17 +26012,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26041,13 +26033,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27984,10 +27973,23 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -28083,19 +28085,6 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28635,6 +28624,11 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -28773,10 +28767,6 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -31389,6 +31379,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31405,17 +31406,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31472,7 +31462,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31523,7 +31513,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31623,7 +31612,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31643,7 +31631,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -31850,7 +31837,23 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -31888,22 +31891,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32599,11 +32586,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32620,6 +32602,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -33574,8 +33561,20 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -33690,18 +33689,6 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34417,6 +34404,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -34451,18 +34450,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 87d5157b3c340..6b13e96d73999 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -6118,8 +6117,24 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6141,22 +6156,6 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -15081,8 +15079,24 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) 
{ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15104,22 +15118,6 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15744,6 +15742,15 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -15896,14 +15903,6 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -21004,6 +21003,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21026,17 +21036,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: 
v_lshlrev_b32_e32 v45, 16, v3 @@ -21054,9 +21053,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -23330,8 +23328,24 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -23353,22 +23367,6 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28420,6 +28418,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28442,17 +28451,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28470,9 +28468,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -30724,8 +30721,24 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -30747,22 +30760,6 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31355,6 +31352,16 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -31507,15 +31514,6 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -34944,7 +34942,23 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34982,22 +34996,6 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35799,11 +35797,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -35820,6 +35813,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -36879,9 +36877,19 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen @@ -36994,16 +37002,6 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37793,6 +37791,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -37827,22 +37841,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index fb2e94fc3b87a..034b8027851f4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -6622,8 +6615,24 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6699,22 +6708,6 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -13043,6 +13036,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -13154,9 +13150,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 
-; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -16459,8 +16445,24 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen @@ -16536,22 +16538,6 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17233,13 +17219,27 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -17395,19 +17395,6 @@ define inreg <48 x half> 
@bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -18157,6 +18144,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18173,8 +18162,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18201,34 +18188,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -22982,6 +22969,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23006,22 +23009,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23037,21 +23024,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, 
v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -25559,8 +25539,24 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -25636,22 +25632,6 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27128,6 +27108,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27144,8 +27126,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27172,34 +27152,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -30384,6 +30364,9 @@ define inreg <48 x i16> 
@bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -30495,9 +30478,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -31199,6 +31179,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31223,22 +31219,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 
v46, 16, v5 @@ -31254,21 +31234,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -33752,8 +33725,24 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -33829,22 +33818,6 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34491,13 +34464,28 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -34653,20 +34641,6 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -35392,6 +35366,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35408,8 +35384,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35436,34 +35410,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -38336,8 +38310,24 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38524,22 +38514,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39451,14 +39425,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -39475,6 +39443,12 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -40754,6 +40728,23 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 @@ -40767,7 +40758,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 
0xffff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 @@ -40775,22 +40765,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41255,6 +41229,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41271,11 +41250,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41320,16 +41294,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: 
v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -41694,9 +41664,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -41713,6 +41680,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 07cdbef82d892..8b6210d6a817a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2741,9 +2741,14 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -2855,11 +2860,6 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5266,6 +5261,10 @@ define <26 x i32> 
@bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7263,11 +7258,6 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7284,6 +7274,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8051,29 +8046,34 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: buffer_store_dword v55, v41, 
s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -8225,11 +8225,6 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, 
off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: 
v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10463,7 +10453,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -13356,9 +13345,14 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13470,11 +13464,6 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14209,6 +14198,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14272,14 +14269,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15164,9 +15148,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17998,11 +17982,6 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18019,6 
+17998,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18938,14 +18922,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18962,6 +18941,11 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; 
GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -23409,9 +23387,14 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -23523,11 +23506,6 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25076,17 +25054,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25948,6 +25921,10 @@ define <13 x i64> 
@bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -27946,11 +27919,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27967,6 +27935,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28748,29 +28721,34 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: buffer_store_dword v55, 
v41, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -28922,11 +28900,6 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i 
; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -32398,9 +32365,14 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -32512,11 +32484,6 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33212,6 +33179,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 
4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -33275,14 +33250,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB49_4:
@@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v52i16_to_v13f64:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v57, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB50_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -36949,11 +36911,6 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -36970,6 +36927,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -37850,14 +37812,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -37874,6 +37831,11 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB53_4:
@@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v57, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB54_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
 ; GFX9-NEXT: s_movk_i32 s7, 0x200
 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -40074,14 +40036,28 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB55_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_or_b32_e32 v25, v38, v25
 ; SI-NEXT: s_cbranch_execnz .LBB55_3
 ; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -41870,8 +41826,24 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -42102,22 +42074,6 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -43338,23 +43294,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -43371,6 +43311,22 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -44684,9 +44640,25 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v4
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
 ; SI-NEXT: v_or_b32_e32 v1, v1, v4
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
@@ -44714,22 +44686,6 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
 ; SI-NEXT: s_waitcnt expcnt(5)
 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
@@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21
 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35
 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20
 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24
 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49
 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
@@ -45729,9 +45678,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -45759,22 +45724,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8eb71e90f8504..09cf27810a5c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -2928,9 +2928,18 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -3030,15 +3039,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v56i16_to_v28i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB14_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -7830,21 +7829,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -7861,6 +7846,20 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -8743,6 +8742,15 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0
@@ -8888,15 +8896,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB17_4:
@@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB18_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: s_movk_i32 s7, 0x200
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB19_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_mov_b32_e32 v48, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
 ; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v27, v50, v27
 ; SI-NEXT: s_cbranch_execnz .LBB19_3
 ; SI-NEXT: .LBB19_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -14434,9 +14431,18 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -14536,15 +14542,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -15382,9 +15379,21 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15412,18 +15421,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB29_4:
@@ -16290,6 +16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v56i16_to_v28f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB30_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -19455,21 +19451,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -19486,6 +19468,20 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -20473,28 +20469,12 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20511,6 +20491,22 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22564,6 +22560,9 @@ define <28 x float> 
@bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 
v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -25329,9 +25323,18 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25431,15 +25434,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> 
%a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -30245,21 +30238,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30276,6 +30255,20 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31172,6 +31165,15 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -31317,15 +31319,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: 
.LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -35112,9 +35103,18 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -35214,15 +35214,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36018,9 +36009,21 @@ define inreg <56 x i16> 
@bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36048,18 +36051,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36927,11 +36923,6 @@ define <14 x double> 
@bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -40041,14 +40031,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -40065,6 +40048,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41035,11 +41025,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -41056,6 +41041,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: 
v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -45395,8 +45383,24 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45662,22 +45666,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -47041,6 +47029,22 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -47061,22 +47065,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -48513,6 +48501,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 @@ -48535,22 +48539,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -49691,6 +49679,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -49712,22 +49716,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 93c11f13ce3ce..4175d5f2de73d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3108,9 +3108,22 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3186,19 +3199,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ 
-6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -8387,11 +8381,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -8408,6 +8397,11 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9393,11 +9387,24 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -9519,19 +9526,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) 
@@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; 
%cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -15531,9 +15520,22 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15609,19 +15611,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16552,12 +16541,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16573,6 +16557,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -20925,11 +20908,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -20946,6 +20924,11 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21861,7 +21844,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen @@ -22028,24 +22011,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -22062,6 +22028,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23185,6 +23167,7 
@@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24687,50 +24670,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -24755,14 +24694,55 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: 
v_or_b32_e32 v2, v11, v2 @@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -27300,9 +27278,22 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -27378,19 +27369,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29304,27 +29295,10 @@ define <15 x i64> 
@bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 
16, v28 @@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -32596,11 +32568,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -32617,6 +32584,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33618,11 +33590,24 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -33744,19 +33729,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37922,9 +37889,22 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -38000,19 +37980,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38899,12 +38866,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -38921,6 +38883,11 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, 
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword 
v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -43173,8 +43134,24 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -43218,22 +43195,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload 
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44289,11 +44250,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -44310,6 +44266,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46905,50 +46866,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -46973,14 +46890,55 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -49301,7 +49257,23 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -49339,22 +49311,6 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -50856,6 +50812,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -50876,22 +50848,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -52448,18 +52404,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -52476,6 +52421,17 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; 
SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 @@ -53662,15 +53615,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], 
s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -53681,9 +53634,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -53693,11 +53648,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -53722,8 +53675,24 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -53748,22 +53717,6 @@ define inreg <60 x 
i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 30ad46d959b7e..b6b59d809306a 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], 
s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -9686,6 +9686,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 @@ -9816,17 +9827,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..04f8ad8a02303 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; 
GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 ; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 8e12e7e03947b..832e43f1e1973 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4253,6 +4253,7 @@ define 
amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm @@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; 
GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s3 @@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 0cae0e51107df..5cc68451d5ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..a4b3a8544dede 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -476,7 +476,6 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse @@ -489,6 +488,7 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1029,7 +1029,6 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse @@ -1040,6 +1039,7 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse +; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 5fb50d0d89530..da08f4fcf8f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; 
CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 279f4298e6418..590d69b8eb869 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,6 +1,19 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s +--- | + + @foo = addrspace(3) global i32 poison + + define void @test_overlap() { unreachable } + define void @test_dead_redef() { unreachable } + define void @test_tied() { unreachable } + define void @test_mmo_merge1() { unreachable } + define void @test_mmo_merge2() { unreachable } + define void @test_mmo_drop() { unreachable } + +... + --- name: test_overlap body: | @@ -47,3 +60,42 @@ body: | %1:vgpr_32 = COPY %0:vgpr_32 %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec ... + +--- +name: test_mmo_merge1 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge1 + ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) +... 
+ +--- +name: test_mmo_merge2 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge2 + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) +... + +--- +name: test_mmo_drop +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_drop + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..d43c6ba322619 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f +; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2416,7 +2416,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 @@ -2459,7 +2458,7 @@ define amdgpu_gfx <72 x i32> 
@return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -2468,6 +2467,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; 
GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], 
s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -3006,9 +2991,24 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0x28 +; 
GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -3050,30 +3050,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX10-NEXT: s_clause 0x15 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -3096,6 +3072,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX10-NEXT: v_mov_b32_e32 v0, 24 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 @@ -3138,7 +3137,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3151,7 +3150,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe +; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3199,7 +3198,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3341,18 +3340,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 +; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index f80716939f618..93d7eeb085107 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; 
GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 7e1055b2a28a4..03b56cad85dac 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 +11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 9689dda9932ed..68f9e839012c3 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; 
CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index 4719ab9090fa5..cbf697fafe683 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -1,13 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s -; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") -; MIR-NEXT: S_WAITCNT 0 -; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { + ; MIR-LABEL: name: gws_barrier_offset0 + ; 
MIR: bb.0 (%ir-block.0): + ; MIR-NEXT: liveins: $sgpr8_sgpr9 + ; MIR-NEXT: {{ $}} + ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4) + ; MIR-NEXT: $m0 = S_MOV_B32 0 + ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec + ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") { + ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") + ; MIR-NEXT: S_WAITCNT 0 + ; MIR-NEXT: } + ; MIR-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } @@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index c5f6e2b0098ae..417b8e08cf669 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -35,7 +35,7 @@ ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] ; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 4419b8c6f9862..af270e5adf75c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 0194d25a99cdc..72b47693c69f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a0b8fb6..1d08097452ce6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f06beff9..83c240c17ff1c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
-; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
@@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v17, s3
; GFX8-NEXT: v_mov_b32_e32 v16, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
@@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v9, s13
-; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
@@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..59f4a9d44bbdd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2cac0985..7203545ebf9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..cb17f01853221 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..062a985dd7180 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
@@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 1d1d3e4a68fee..9da7a79ba2fdf 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
@@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
;
GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index fc36ed939d91d..84db54c2d537f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: 
s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 4ab05c2923fdb..c1f4d7bbf650e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -19,12 +19,12 @@ $_f2 = comdat any define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: ds_read_u8 v2, v1 offset:2 ; GCN-NEXT: ds_read_u16 v3, v1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 2 +; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 24c1bfb8d50f0..ccfd45bc87e71 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -75,15 +75,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: ds_write_b32 v1, v2 offset:256 +; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v2, 3 -; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ds_write_b32 v1, v2 offset:256 +; GCN-NEXT: v_mov_b32_e32 v2, 3 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ae0805448d693..ba532949a687d 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x24 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..af7ca0fb59682 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void 
@memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x33 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; 
ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(38) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; 
ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4251,260 +4327,133 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; 
ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; 
ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; 
ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill @@ -4513,37 +4462,34 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 @@ -4553,6 +4499,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 @@ -4641,21 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 @@ -4672,18 +4605,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 @@ -4700,6 +4621,36 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4712,10 +4663,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -5181,6 +5128,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -5234,8 +5183,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], 
s32 offset:8 @@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 @@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 @@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-LABEL: memmove_p0_p5_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 ; 
ALIGNED-NEXT: s_cbranch_execz .LBB9_2 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 @@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte 
v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x30 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: 
buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: 
buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill @@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: 
buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0xc +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt 
vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(32) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill @@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v106, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 +; 
ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen 
offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 
v109, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill @@ -13509,7 +13410,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 @@ -13518,10 +13418,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 
ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 @@ -13606,22 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 @@ -13638,18 +13525,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 @@ -13666,6 +13541,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -13678,10 +13583,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -14147,6 +14048,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -14200,8 +14103,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 @@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: 
buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 @@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; 
ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword 
v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 
+; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 
+; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14709,313 +14694,180 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; 
ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 
offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 
0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -15027,36 +14879,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 @@ -15065,10 +14915,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15153,22 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15185,18 +15022,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15213,6 +15038,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 @@ -15225,10 +15080,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 @@ -15694,6 +15545,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -15747,10 +15602,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 @@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 71900a4d1c1e4..32800488f0633 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: image_sample_lz v1, v0, 
s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 -; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 78207c2cf605e..1177474f5b4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; 
GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 
v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; 
GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x 
float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: 
v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; 
GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 
0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index d0d5cc11994af..025d9e63436d7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ 
body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,6 +359,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABLE: name: no_sched_barrier_within_bundle + ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index 5fea0aee72ec7..e0266b9f1a5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ 
-25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..b91bdd2b2fa71 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], 
v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(10) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(9) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; 
GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..fb9c47731eb42 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,43 +28,38 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, 
s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -74,49 +69,40 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 
v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -125,6 +111,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -139,6 +138,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -153,84 +153,70 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; 
VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 
0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -238,6 +224,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -252,6 +251,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -266,83 +266,69 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: 
buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: 
v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -350,6 +336,18 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -364,6 +362,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; 
GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -414,7 +413,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -422,24 +420,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -447,9 +444,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -457,6 +453,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -475,6 +474,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -503,7 +504,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -525,7 +525,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -533,24 +532,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -558,9 +556,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -568,6 +565,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -586,6 +586,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -974,42 +976,43 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 
0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -1024,8 +1027,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -1051,43 +1053,38 @@ define 
amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1097,49 +1094,40 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1148,6 +1136,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1162,6 +1163,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1176,84 +1178,70 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1261,6 +1249,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1275,6 +1276,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1289,83 +1291,69 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1373,6 +1361,18 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -1387,6 +1387,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1415,7 +1417,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1437,7 +1438,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1445,24 +1445,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1470,9 +1469,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1498,6 +1499,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1526,7 +1529,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1548,7 +1550,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1556,24 +1557,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1581,9 +1581,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1590,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1609,6 +1611,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1997,42 +2001,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2052,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,43 +2078,38 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2120,49 +2119,40 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2171,6 +2161,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2185,6 +2188,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2199,84 +2203,70 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2284,6 +2274,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2298,6 +2301,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2312,83 +2316,69 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2396,6 +2386,18 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -2410,6 +2412,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2438,7 +2442,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2460,7 +2463,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2468,24 +2470,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2493,9 +2494,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2503,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2521,6 +2524,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2549,7 +2554,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2571,7 +2575,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2579,24 +2582,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2604,9 +2606,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2615,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2632,6 +2636,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3020,42 +3026,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ;
GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -3070,8 +3077,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -3091,49 +3097,44 @@ define amdgpu_cs float @cs_main(i32 %idx) { define amdgpu_hs float @hs_main(i32 %idx) { ; SI-LABEL: hs_main: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s0 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s0 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: 
v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -3143,49 +3144,40 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: 
buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3194,6 +3186,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3208,6 +3213,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3222,84 +3228,70 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 
0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; 
VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3307,6 +3299,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3321,6 +3326,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3334,83 +3340,69 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, 
s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -3418,6 +3410,18 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -3432,6 
+3436,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3459,7 +3465,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3481,7 +3486,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3489,24 +3493,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3514,9 +3517,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, 
s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3524,6 +3526,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3542,6 +3547,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3569,7 +3576,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3591,7 +3597,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3599,24 +3604,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 
offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3624,9 +3628,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3634,6 +3637,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3652,6 +3658,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4040,42 +4048,43 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: 
v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4090,8 +4099,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; 
GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4117,43 +4125,38 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4163,49 +4166,40 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: 
v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4214,6 +4208,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: 
buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4228,6 +4235,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4242,84 +4250,70 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; 
VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4327,6 +4321,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: 
buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4341,6 +4348,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4354,83 +4362,69 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: 
v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4438,6 +4432,18 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -4452,6 +4458,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4479,7 +4487,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4501,7 +4508,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f 
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4509,24 +4515,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4534,9 +4539,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4544,6 +4548,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4562,6 +4569,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4589,7 +4598,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4611,7 +4619,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4619,24 +4626,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4644,9 +4650,8 
@@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4654,6 +4659,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4672,6 +4680,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -5060,42 +5070,43 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: 
v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5110,8 +5121,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5141,43 +5151,38 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; 
SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5187,49 +5192,40 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 
0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5238,6 +5234,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5252,6 +5261,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, 
s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5267,84 +5277,70 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 
offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5352,6 +5348,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; 
VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5366,6 +5375,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5380,83 +5390,69 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; 
GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5464,6 +5460,18 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5478,8 +5486,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -5491,10 +5500,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5505,8 +5514,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5528,7 +5535,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5536,24 +5542,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5561,9 +5568,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5571,6 +5577,8 @@ define amdgpu_hs 
<{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5589,6 +5597,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -5602,10 +5612,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5616,8 +5626,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5639,7 +5647,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5647,24 +5654,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 
0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5672,9 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5682,6 +5689,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5700,6 +5709,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: 
buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6093,10 +6104,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -6105,29 +6116,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -6142,8 +6155,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -6172,43 +6184,38 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6218,49 
+6225,40 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6269,6 +6267,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: 
buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6283,6 +6294,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6298,84 +6310,70 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, 
off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: 
buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6383,6 +6381,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6397,6 +6408,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6411,83 +6423,69 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 
0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6495,6 +6493,18 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6509,8 +6519,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: 
v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -6522,10 +6533,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6536,8 +6547,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6559,7 +6568,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6567,24 +6575,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6592,9 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6602,6 +6610,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6620,6 +6630,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6633,10 +6645,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6647,8 +6659,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6670,7 +6680,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6678,24 +6687,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6703,9 +6713,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 
offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6713,6 +6722,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6731,6 +6742,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -7124,10 +7137,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -7136,29 +7149,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -7173,8 +7188,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 71e4755b58bf2..c90d7887f2ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,9 +3,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 - ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword @@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa ; CHECK-NOT: v_readlane_b32 ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 + entry: %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %i2 = load i64, ptr addrspace(4) %i, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index da48af100d27b..1a0f75e048cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] -; GFX90A-NEXT: s_nop 4 -; 
GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 50056b62b3397..b5474b8974b29 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 @@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 @@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 @@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: 
global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 @@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..802de8037cf6b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: v_writelane_b32 v40, s34, 3 ; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir index ee2e58f2a6cc1..a1771f9356014 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir @@ -98,28 +98,29 @@ body: | ; CHECK-LABEL: name: foo ; CHECK: liveins: $q0, $r0, $r1, $r2, $lr - ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) - ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) - ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) - ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { - ; CHECK: MVE_VPTv4f32r 1, renamable $q0, 
$zr, 10, implicit-def $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) - ; CHECK: } - ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) - ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK: } - ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) { + ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) { + ; CHECK-NEXT: MVE_VPST 4, implicit $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al 
*/, $noreg, def $r7, def $pc, implicit $q0
    $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4

From 66f52ca9a09c6e265385046f36ac51a863078948 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Tue, 4 Nov 2025 23:22:47 -0800
Subject: [PATCH 283/313] test: correct typo in RUN line (#166511)

Correct a typo in the triple that is used for the test. Because the OS
was not recognised, it would fall back to the non-Windows code
generation.
---
 llvm/test/CodeGen/AArch64/preserve_mostcc.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 75c8567e2095e..f77ada4eae022 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
-; RUN: llc < %s -mtriple=aarch64-unknown-windiws-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s

 declare void @standard_cc_func()
 declare preserve_mostcc void @preserve_mostcc_func()

From 9016c60c685e8f651fa9f9250703afe5f2c02565 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Wed, 5 Nov 2025 08:39:34 +0100
Subject: [PATCH 284/313] [clang] Call ActOnCaseExpr even if the 'case' is missing (#166326)

This otherwise happens in ParseCaseExpression. If we don't call this,
we don't perform the usual arithmetic conversions, etc.
---
 clang/lib/Parse/ParseStmt.cpp                    | 2 +-
 clang/test/SemaCXX/constant-expression-cxx14.cpp | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 92038985f9163..fb45db1139349 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
       return StmtError();
     }
   } else {
-    LHS = Expr;
+    LHS = Actions.ActOnCaseExpr(CaseLoc, Expr);
     MissingCase = false;
   }

diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index bea90ff7eaf8a..1fc6e5ec4cc55 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1450,3 +1450,9 @@ namespace GH149500 {
   unsigned int * p = &(*(unsigned int *)0x400);
   static const void *q = &(*(const struct sysrq_key_op *)0);
 }
+
+constexpr bool missingCase() {
+  switch (1) {
+    1u: return false; // expected-error {{expected 'case' keyword before expression}}
+  }
+}

From 98f0139f5bd9f8caafea5b7dc3ed27ec7ee4408f Mon Sep 17 00:00:00 2001
From: Karlo Basioli <basioli@google.com>
Date: Wed, 5 Nov 2025 09:41:04 +0100
Subject: [PATCH 285/313] Fix bazel build issue caused by #166259 (#166519)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index bcee6c4e243a4..ad013035251f5 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2551,6 +2551,7 @@ llvm_target_lib_list = [lib for lib in [
             "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"],
"lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], }, }, { From 988c1b1c8dc87b5f373046c999d47707e358bdd0 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Wed, 5 Nov 2025 09:42:35 +0100 Subject: [PATCH 286/313] [libc++] Remove <cstdlib> include from <exception> (#166340) --- libcxx/include/__exception/exception_ptr.h | 7 ++++--- libcxx/include/exception | 5 ++++- libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index e78126ea23852..aef036a2c9586 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,6 +11,7 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> @@ -18,7 +19,7 @@ #include <__type_traits/is_pointer.h> #include <__utility/move.h> #include <__utility/swap.h> -#include <cstdlib> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -35,7 +36,7 @@ _LIBCPP_PUSH_MACROS namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -174,7 +175,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version From 51d0f6d6e172c133e1fbd87447bba49a2ad7e6fe Mon Sep 17 00:00:00 2001 From: mitchell <mitchell.xu2@gmail.com> Date: Wed, 5 Nov 2025 16:43:27 +0800 Subject: [PATCH 287/313] [clang-tidy] Rename `cert-dcl58-cpp` to `bugprone-std-namespace-modification` (#165659) Closes [#157290](https://github.com/llvm/llvm-project/issues/157290) --- .../bugprone/BugproneTidyModule.cpp | 3 + .../clang-tidy/bugprone/CMakeLists.txt | 1 + .../StdNamespaceModificationCheck.cpp} | 10 
+-- .../StdNamespaceModificationCheck.h} | 16 ++--- .../clang-tidy/cert/CERTTidyModule.cpp | 5 +- .../clang-tidy/cert/CMakeLists.txt | 1 - clang-tools-extra/docs/ReleaseNotes.rst | 5 ++ .../bugprone/std-namespace-modification.rst | 63 +++++++++++++++++++ .../docs/clang-tidy/checks/cert/dcl58-cpp.rst | 54 +--------------- .../docs/clang-tidy/checks/list.rst | 3 +- .../Inputs/Headers/system-header-simulation.h | 2 +- .../std-namespace-modification.cpp} | 4 +- 12 files changed, 96 insertions(+), 71 deletions(-) rename clang-tools-extra/clang-tidy/{cert/DontModifyStdNamespaceCheck.cpp => bugprone/StdNamespaceModificationCheck.cpp} (95%) rename clang-tools-extra/clang-tidy/{cert/DontModifyStdNamespaceCheck.h => bugprone/StdNamespaceModificationCheck.h} (61%) create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst rename clang-tools-extra/test/clang-tidy/checkers/{cert/dcl58-cpp.cpp => bugprone/std-namespace-modification.cpp} (97%) diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 2e21a4c4fd1f9..611717a64b768 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -73,6 +73,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -237,6 +238,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck<StandaloneEmptyCheck>( "bugprone-standalone-empty"); + CheckFactories.registerCheck<StdNamespaceModificationCheck>( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck<StringConstructorCheck>( "bugprone-string-constructor"); CheckFactories.registerCheck<StringIntegerAssignmentCheck>( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 31a0e6906866a..6c96996458040 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -75,6 +75,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..13e5c03d7c4d3 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void 
DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs<Decl>("decl"); const auto *NS = Result.Nodes.getNodeAs<NamespaceDecl>("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index d5179770008e5..15592d5c03c06 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -19,6 +19,7 @@ #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -36,7 +37,6 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DontModifyStdNamespaceCheck.h" #include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" #include "MutatingCopyCheck.h" @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck<misc::NewDeleteOverloadsCheck>( "cert-dcl54-cpp"); - CheckFactories.registerCheck<DontModifyStdNamespaceCheck>("cert-dcl58-cpp"); + CheckFactories.registerCheck<bugprone::StdNamespaceModificationCheck>( + "cert-dcl58-cpp"); CheckFactories.registerCheck<google::build::UnnamedNamespaceInHeaderCheck>( "cert-dcl59-cpp"); // ERR diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index db3b2f5a08286..27cf392ce0bb1 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DontModifyStdNamespaceCheck.cpp FloatLoopCounter.cpp LimitedRandomnessCheck.cpp MutatingCopyCheck.cpp diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 26f00e9a8a294..88eb0ebff4501 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -244,6 +244,11 @@ New check aliases <clang-tidy/checks/modernize/avoid-variadic-functions>` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp <clang-tidy/checks/cert/dcl58-cpp>` to + :doc:`bugprone-std-namespace-modification + <clang-tidy/checks/bugprone/std-namespace-modification>` + keeping initial check as an alias to the new one. 
+ - Renamed :doc:`cert-env33-c <clang-tidy/checks/cert/env33-c>` to :doc:`bugprone-command-processor <clang-tidy/checks/bugprone/command-processor>` diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. + +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on a user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user-provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +.. code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-std-namespace-modification] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash<MyData> { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +<https://www.securecoding.cert.org/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications.
-The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. -- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash<MyData> { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` check is an alias, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP.
Do not modify the standard namespaces diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e14ac715cfeeb..5094bcc90caf3 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -141,6 +141,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression <bugprone/sizeof-expression>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`, :doc:`bugprone-standalone-empty <bugprone/standalone-empty>`, "Yes" + :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`bugprone-string-constructor <bugprone/string-constructor>`, "Yes" :doc:`bugprone-string-integer-assignment <bugprone/string-integer-assignment>`, "Yes" :doc:`bugprone-string-literal-with-embedded-nul <bugprone/string-literal-with-embedded-nul>`, @@ -175,7 +176,6 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`, :doc:`bugprone-use-after-move <bugprone/use-after-move>`, :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes" - :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`cert-err33-c <cert/err33-c>`, :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`cert-flp30-c <cert/flp30-c>`, @@ -441,6 +441,7 @@ Check aliases :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`, :doc:`modernize-avoid-variadic-functions <modernize/avoid-variadic-functions>`, :doc:`cert-dcl51-cpp <cert/dcl51-cpp>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`cert-dcl54-cpp <cert/dcl54-cpp>`, :doc:`misc-new-delete-overloads <misc/new-delete-overloads>`, + :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`cert-dcl59-cpp <cert/dcl59-cpp>`, :doc:`google-build-namespaces <google/build-namespaces>`, :doc:`cert-env33-c <cert/env33-c>`, :doc:`bugprone-command-processor <bugprone/command-processor>`, :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). 
namespace std { template <> void swap<short>(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; From 5b5d0a831967ea49a3b4642923308d08fb69b06c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Wed, 5 Nov 2025 10:00:47 +0100 Subject: [PATCH 288/313] [libc++][NFC] Make __type_info_implementations a namespace (#166339) There doesn't seem to be much of a reason why this should be a struct. Make it a namespace instead. --- libcxx/include/typeinfo | 160 ++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned
char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct __non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. 
- return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. + return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. + typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) From 628d53aba53204b8b7eac69b200b04bc4433deac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Spaits?= <gaborspaits1@gmail.com> Date: Wed, 5 Nov 2025 10:04:32 +0100 Subject: [PATCH 289/313] [InstCombine] Enable FoldOpIntoSelect and foldOpIntoPhi when the Op's other parameter is non-const (#166102) This patch enables `FoldOpIntoSelect` and `foldOpIntoPhi` for cases where Op's second operand is a non-constant. It doesn't seem to bring significant improvements, but the compile-time impact is negligible.
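For illustration, a minimal before/after sketch based on the @basic function from the newly added or-select-zero-icmp.ll test (the exact result for other patterns depends on the surrounding combines):

  define i32 @basic(i32 %a, i32 %b) {
    %cmp = icmp eq i32 %a, 0
    %sel = select i1 %cmp, i32 %b, i32 0  ; select feeding a binop whose other operand %a is non-constant
    %or = or i32 %sel, %a
    ret i32 %or
  }

With this change, instcombine folds the `or` into the select arms (in the true arm %a is known to be 0, so `or i32 %b, %a` simplifies to `%b`; in the false arm `or i32 0, %a` simplifies to `%a`), yielding:

  define i32 @basic(i32 %a, i32 %b) {
    %cmp = icmp eq i32 %a, 0
    %res = select i1 %cmp, i32 %b, i32 %a
    ret i32 %res
  }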
--- .../InstCombine/InstructionCombining.cpp | 6 +- .../InstCombine/binop-phi-operands.ll | 5 +- .../Transforms/InstCombine/binop-select.ll | 187 +++++++++++++++++- .../InstCombine/dont-distribute-phi.ll | 10 +- llvm/test/Transforms/InstCombine/fmul.ll | 4 +- .../Transforms/InstCombine/free-inversion.ll | 4 +- .../InstCombine/or-select-zero-icmp.ll | 169 ++++++++++++++++ .../test/Transforms/InstCombine/recurrence.ll | 4 +- llvm/test/Transforms/InstCombine/sub-gep.ll | 3 +- .../AArch64/predicated-reduction.ll | 105 +++++----- 10 files changed, 424 insertions(+), 73 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..b158e0f626850 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..fe1ec9014f188 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,188 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], 
[[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 42 +; CHECK-NEXT: [[V2:%.*]] = or i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 9, i8 1 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 1, i8 7 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V0:%.*]] = zext i1 
[[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 45 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x 
double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 
@negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; 
CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 
x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], 
[[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 
noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label 
%[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> 
[[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: From 8eacea993408c27287f985f27376501442b4dfc6 Mon Sep 17 00:00:00 2001 From: Andi Drebes <47449897+andidr@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:05:06 +0100 Subject: [PATCH 290/313] [MLIR][ODS] Re-enable direct implementation of type interfaces with method bodies (#166335) Since commit 842622bf8bea782e9d9865ed78b0d8643f098122, which added support for overloading interface methods, a `using` directive is emitted for any interface method that does not require emission of a trait method, including for methods that define a method body. However, methods directly specifying a body (e.g., via the `methodBody` parameter of `InterfaceMethod`) are implemented directly in the interface class and are therefore not present in the associated trait. The generated `using` directive then refers to a non-existent method of the trait, resulting in an error upon compilation of the generated code. This patch changes `DefGen::emitTraitMethods()` so that `genTraitMethodUsingDecl()` is no longer invoked for interface methods with a body.
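For context, a minimal hypothetical sketch of the failure mode (the struct and method names below are invented for illustration and do not match the generated code exactly): when an ODS `InterfaceMethod` supplies a `methodBody`, the method is emitted only on the interface class, so a trait-level `using` declaration has nothing to refer to.

```c++
// Hypothetical reduction of the tblgen output for a type interface whose
// method `printTypeA` is declared with a methodBody.
struct TypeInterfaceTrait {
  // Deliberately empty: a method with a methodBody lives on the interface
  // class itself, so the trait never declares it.
};

struct TestBaseBodyType : TypeInterfaceTrait {
  // Before this fix, DefGen still emitted the equivalent of:
  //   using TypeInterfaceTrait::printTypeA; // error: 'printTypeA' is not a
  //                                         // member of the trait
  // With the fix, no using declaration is generated for such methods.
};
```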
--- mlir/test/lib/Dialect/Test/TestTypeDefs.td | 7 +++++++ mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 6 ++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index ea20597231d58..9859bd06cb526 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -470,4 +470,11 @@ def TestMemrefType : Test_Type<"TestMemref", }]; } +// Test implementation of an interface with methods specifying a +// method body +def TestBaseBody : Test_Type<"TestBaseBody", + [DeclareTypeInterfaceMethods<TestBaseTypeInterfacePrintTypeA>]> { + let mnemonic = "test_base_body"; +} + #endif // TEST_TYPEDEFS diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 8ec2e03095423..2a513c3b8cc9b 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -637,8 +637,10 @@ void DefGen::emitTraitMethods(const InterfaceTrait &trait) { for (auto &method : iface.getMethods()) { // Don't declare if the method has a body. Or if the method has a default // implementation and the def didn't request that it always be declared. - if (method.getBody() || (method.getDefaultImplementation() && - !alwaysDeclared.count(method.getName()))) { + if (method.getBody()) + continue; + if (method.getDefaultImplementation() && + !alwaysDeclared.count(method.getName())) { genTraitMethodUsingDecl(trait, method); continue; } From 7b3fe5fd42f1178f98879642e4ed4bd23a74cfe6 Mon Sep 17 00:00:00 2001 From: David Sherwood <david.sherwood@arm.com> Date: Wed, 5 Nov 2025 09:18:02 +0000 Subject: [PATCH 291/313] [LV][NFC] Remove undef values in some test cases (#164401) Split off from PR #163525, this standalone patch replaces simple cases where undef is used as a value for arithmetic or getelementptr instructions. This will reduce the likelihood of contributors hitting the `undef deprecator` warning in github. --- .../X86/consecutive-ptr-cg-bug.ll | 12 +-- .../LoopVectorize/X86/cost-model-assert.ll | 20 ++--- .../X86/interleaved-accesses-large-gap.ll | 12 +-- .../demanded-bits-of-pointer-instruction.ll | 4 +- .../LoopVectorize/if-pred-stores.ll | 18 ++-- .../Transforms/LoopVectorize/nsw-crash.ll | 4 +- llvm/test/Transforms/LoopVectorize/optsize.ll | 88 +++++++++++-------- llvm/test/Transforms/LoopVectorize/pr32859.ll | 4 +- llvm/test/Transforms/LoopVectorize/pr36311.ll | 13 ++- .../Transforms/LoopVectorize/reduction-ptr.ll | 4 +- .../LoopVectorize/reduction-small-size.ll | 26 ++++-- .../LoopVectorize/runtime-drop-crash.ll | 4 +- .../LoopVectorize/undef-inst-bug.ll | 10 +-- 13 files changed, 121 insertions(+), 98 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, 
ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. -define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. 
; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..e7913c583b938 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -384,15 +384,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. ; -define void @minimal_bit_widths(i1 %c) { +define void @minimal_bit_widths(ptr %p, i1 %c) { ; UNROLL-LABEL: @minimal_bit_widths( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] @@ -415,8 +415,8 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: vector.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -442,16 +442,16 @@ define void @minimal_bit_widths(i1 %c) { ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1 ; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]] +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] ; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP6:%.*]] = 
getelementptr i8, ptr undef, i64 [[TMP5]] +; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]] ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] @@ -468,7 +468,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] - %tmp2 = getelementptr i8, ptr undef, i64 %tmp0 + %tmp2 = getelementptr i8, ptr %p, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 br i1 %c, label %if.then, label %for.inc diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll index 106054d989776..d87d9b155db1c 100644 --- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define void @test(i1 %arg) { +define void @test(ptr %p, i1 %arg) { entry: br i1 %arg, label %while.end, label %while.body.lr.ph @@ -11,7 +11,7 @@ while.body.lr.ph: br label %while.body while.body: - %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] + %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] %incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1 %inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef. %cmp.i11 = icmp eq ptr %incdec.ptr.i, undef diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 763072ab16f73..f9f7feb7bdfbc 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -248,25 +248,27 @@ for.end: ; preds = %for.body ; @cm_array = external global [2592 x i16], align 1 -define void @pr43371() optsize { +define void @pr43371(i16 %val) optsize { ; ; CHECK-LABEL: define void @pr43371( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr 
[2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -277,22 +279,24 @@ define void @pr43371() optsize { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371( -; PGSO-SAME: ) #[[ATTR0]] { +; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -303,22 +307,24 @@ define void @pr43371() optsize { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371( -; NPGSO-SAME: ) #[[ATTR0]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: +; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> 
[[TMP1]], i32 0 +; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -340,7 +346,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 @@ -349,25 +355,27 @@ for.body29: br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } -define void @pr43371_pgso() !prof !14 { +define void @pr43371_pgso(i16 %val) !prof !14 { ; ; CHECK-LABEL: define void @pr43371_pgso( -; CHECK-SAME: ) !prof [[PROF14]] { +; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371_pgso( -; PGSO-SAME: ) !prof [[PROF14]] { +; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label 
%[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371_pgso( -; NPGSO-SAME: ) !prof [[PROF14]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] ; NPGSO: [[VECTOR_SCEVCHECK]]: -; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755 +; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]] +; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]] +; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]] ; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32 ; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1 @@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 { ; NPGSO-NEXT: unreachable ; NPGSO: [[FOR_BODY29]]: ; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]] ; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] ; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 @@ -449,7 +461,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll index 
2d30e0c9ad10f..f65e9cab1700b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -10,13 +10,13 @@ ; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] ; Function Attrs: nounwind uwtable -define void @main(i32 %n) #0 { +define void @main(i32 %n, i32 %v) #0 { entry: br label %for.cond1.preheader.i for.cond1.preheader.i: ; preds = %if.end.2.i, %entry %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] - %tobool.i = icmp ne i32 undef, 0 + %tobool.i = icmp ne i32 %v, 0 br label %if.end.2.i if.end.2.i: ; preds = %for.cond1.preheader.i diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll index f2dfecc341e6f..bc27e4e9b09cd 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36311.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll @@ -10,10 +10,7 @@ $test = comdat any -declare i32 @__gxx_personality_v0(...) - -; Function Attrs: uwtable -define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 { +define void @test(ptr %p, i1 %arg) { entry: br label %for.body51 @@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89 for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51 %i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ] - %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ] + %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ] %add90 = add nuw i32 %i79.0179, 1 - %mul91 = mul i32 %add90, undef + %mul91 = mul i32 %add90, 7 br label %for.body89 for.body89: ; preds = %for.body89, %for.body89.lr.ph @@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo %add93 = add i32 %add92, %mul91 %inc94 = add i32 %next_index.5174, 1 %conv95 = zext i32 %next_index.5174 to i64 - %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95 + %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95 store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1 - %cmp87 = icmp ult i32 %add92, undef + %cmp87 = icmp ult i32 %add92, 123 br i1 %cmp87, label %for.body89, label %for.cond80.loopexit nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll index 0656cd2b2aa94..0fdc8fd6ad519 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll @@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]] ; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: loopexit: ; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ] @@ -31,7 +31,7 @@ for.body: %cmp2 = icmp ult ptr %q, %g %umin = select i1 %cmp2, ptr %q, ptr %g %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, undef + %exitcond = icmp eq i64 %iv.next, 123 br i1 %exitcond, label %loopexit, label %for.body loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 13cc1b657d231..f01e562fe40c7 100644 --- 
a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i8 @PR34687(i1 %c, i32 %x, i32 %n) { +define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-LABEL: @PR34687( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4 @@ -13,20 +13,30 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255) -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> ; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +46,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] 
= sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +72,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label %if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +84,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. 
- %idxprom21 = zext i32 undef to i64 + %idxprom21 = zext i32 0 to i64 - %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17 - store i32 undef, ptr %arrayidx23, align 4 + %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17 + store i32 poison, ptr %arrayidx23, align 4 %indvars.next= add i64 %indvars.iv17, -1 %0 = trunc i64 %indvars.next to i32 - %cmp15 = icmp ugt i32 %0, undef + %cmp15 = icmp ugt i32 %0, poison br i1 %cmp15, label %for.body, label %loopexit loopexit: From 0314b939d69597e16e08bb31587fa53c1bea75cb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Wed, 5 Nov 2025 09:56:20 +0000 Subject: [PATCH 292/313] [Headers][X86] avx ifma - move constexpr to the end of the function attribute lists. NFC. (#166523) Makes it easier to compare constexpr/non-constexpr attribute defines Allows clang-format to pack the attributes more efficiently --- clang/lib/Headers/avx512ifmaintrin.h | 5 ++--- clang/lib/Headers/avx512ifmavlintrin.h | 13 ++++++------- clang/lib/Headers/avxifmaintrin.h | 8 ++++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 625a8ff66dc60..f73b607df797f 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -17,9 +17,8 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ - __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index b377c17166ffb..51d5210e5aa5d 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,13 +18,13 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -34,7 +34,6 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512ifma,avx512vl"), \ __min_vector_width__(256))) - #endif #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index e452d5f0920e9..30df01caed6cf 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -17,11 +17,11 @@ /* Define the default attributes for the functions in this file. 
*/ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ From cc9ad9afc6d22f9955a35aa8f62f2bbf11109673 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki <pnagato@protonmail.com> Date: Wed, 5 Nov 2025 01:58:30 -0800 Subject: [PATCH 293/313] [Clang] Add constexpr support for AVX512 permutex2 intrinsics (#165085) This patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts. Extend shuffle generic to handle both integer immediate and vector mask operands. Resolves #161335 --- clang/include/clang/Basic/BuiltinsX86.td | 61 ++----- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 88 +++++++++- clang/lib/AST/ExprConstant.cpp | 111 ++++++++++++- clang/lib/Headers/avx10_2_512bf16intrin.h | 9 +- clang/lib/Headers/avx10_2bf16intrin.h | 15 +- clang/lib/Headers/avx512bwintrin.h | 20 +-- clang/lib/Headers/avx512fintrin.h | 85 ++++------ clang/lib/Headers/avx512fp16intrin.h | 4 +- clang/lib/Headers/avx512vbmiintrin.h | 48 +++--- clang/lib/Headers/avx512vbmivlintrin.h | 86 +++++----- clang/lib/Headers/avx512vlbwintrin.h | 44 ++--- clang/lib/Headers/avx512vlfp16intrin.h | 8 +- clang/lib/Headers/avx512vlintrin.h | 77 ++++----- clang/test/CodeGen/X86/avx512bw-builtins.c | 153 +++++++++++++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 120 ++++++++++++++ clang/test/CodeGen/X86/avx512vbmi-builtins.c | 154 ++++++++++++++++++ clang/test/CodeGen/X86/avx512vbmivl-builtin.c | 70 +++++++- clang/test/CodeGen/X86/avx512vl-builtins.c | 128 +++++++++++++++ clang/test/CodeGen/X86/avx512vlbw-builtins.c | 80 +++++++++ 19 files changed, 1096 insertions(+), 265 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 9e877b92eac68..4388c09423a21 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, 
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 9e877b92eac68..4388c09423a21 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
   def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
   def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
   def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
 }
 
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
 }
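[The per-width decode rule that the builtin table above now marks Constexpr, and that both constant evaluators below implement as lambdas, can be summarized in one hedged helper; the function is illustrative and not part of the patch.]

// For a vector of NumElts = 2^LogN lanes: the low LogN bits of a lane's
// index pick the element, and the next bit picks the source operand.
static inline void decode_vpermi2var_index(unsigned Idx, unsigned LogN,
                                           unsigned *SrcVec, unsigned *Elem) {
  *Elem = Idx & ((1u << LogN) - 1); // offset within the chosen source
  *SrcVec = (Idx >> LogN) & 0x1;    // 0 selects A, 1 selects B
}

[For example, LogN is 2 for the 4-lane 128-bit dword variant (mask 0x3, shift 2) and 6 for the 64-lane 512-bit byte variant (mask 0x3F, shift 6), matching the lambdas below.]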
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8b57b963c538f..9991e365addb8 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3415,18 +3415,46 @@ static bool interp__builtin_ia32_shuffle_generic(
         GetSourceIndex) {
   assert(Call->getNumArgs() == 3);
 
-  unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+
+  unsigned ShuffleMask = 0;
+  Pointer A, MaskVector, B;
+
+  QualType Arg2Type = Call->getArg(2)->getType();
+  bool IsVectorMask = false;
+  if (Arg2Type->isVectorType()) {
+    IsVectorMask = true;
+    B = S.Stk.pop<Pointer>();
+    MaskVector = S.Stk.pop<Pointer>();
+    A = S.Stk.pop<Pointer>();
+  } else if (Arg2Type->isIntegerType()) {
+    ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+    B = S.Stk.pop<Pointer>();
+    A = S.Stk.pop<Pointer>();
+  } else {
+    return false;
+  }
 
   QualType Arg0Type = Call->getArg(0)->getType();
   const auto *VecT = Arg0Type->castAs<VectorType>();
   PrimType ElemT = *S.getContext().classify(VecT->getElementType());
   unsigned NumElems = VecT->getNumElements();
-  const Pointer &B = S.Stk.pop<Pointer>();
-  const Pointer &A = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
+  PrimType MaskElemT = PT_Uint32;
+  if (IsVectorMask) {
+    QualType Arg1Type = Call->getArg(1)->getType();
+    const auto *MaskVecT = Arg1Type->castAs<VectorType>();
+    QualType MaskElemType = MaskVecT->getElementType();
+    MaskElemT = *S.getContext().classify(MaskElemType);
+  }
+
   for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
+    if (IsVectorMask) {
+      INT_TYPE_SWITCH(MaskElemT, {
+        ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
+      });
+    }
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
 
     if (SrcIdx < 0) {
@@ -4434,6 +4462,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)};
         }
       });
+  case X86::BI__builtin_ia32_vpermi2varq128:
+  case X86::BI__builtin_ia32_vpermi2varpd128:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x1;
+          unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2vard128:
+  case X86::BI__builtin_ia32_vpermi2varps128:
+  case X86::BI__builtin_ia32_vpermi2varq256:
+  case X86::BI__builtin_ia32_vpermi2varpd256:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x3;
+          unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varhi128:
+  case X86::BI__builtin_ia32_vpermi2vard256:
+  case X86::BI__builtin_ia32_vpermi2varps256:
+  case X86::BI__builtin_ia32_vpermi2varq512:
+  case X86::BI__builtin_ia32_vpermi2varpd512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x7;
+          unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi128:
+  case X86::BI__builtin_ia32_vpermi2varhi256:
+  case X86::BI__builtin_ia32_vpermi2vard512:
+  case X86::BI__builtin_ia32_vpermi2varps512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0xF;
+          unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi256:
+  case X86::BI__builtin_ia32_vpermi2varhi512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x1F;
+          unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x3F;
+          unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512:
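[A hand-worked check of the case lambdas above, with values chosen here for illustration; the same decode recurs in the ExprConstant.cpp evaluator next.]

// vpermi2varqi128 has 16 lanes, so Offset = Idx & 0xF, SrcIdx = (Idx >> 4) & 1:
//   Idx 0x00 -> A[0]     Idx 0x0F -> A[15]
//   Idx 0x10 -> B[0]     Idx 0x1F -> B[15]
//   Idx 0x23 -> A[3], since (0x23 >> 4) & 1 == 0 and 0x23 & 0xF == 3
//   (bits above the source-select bit are ignored).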
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 97eeba8b9d6cc..8fab6efafb983 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric(
   if (!VT)
     return false;
 
-  APSInt MaskImm;
-  if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
-    return false;
-  unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+  unsigned ShuffleMask = 0;
+  APValue A, MaskVector, B;
+  bool IsVectorMask = false;
 
-  APValue A, B;
-  if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
-      !EvaluateAsRValue(Info, Call->getArg(1), B))
+  QualType Arg2Type = Call->getArg(2)->getType();
+  if (Arg2Type->isVectorType()) {
+    IsVectorMask = true;
+    if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+        !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
+        !EvaluateAsRValue(Info, Call->getArg(2), B))
+      return false;
+  } else if (Arg2Type->isIntegerType()) {
+    APSInt MaskImm;
+    if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
+      return false;
+    ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+    if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+        !EvaluateAsRValue(Info, Call->getArg(1), B))
+      return false;
+  } else {
     return false;
+  }
 
   unsigned NumElts = VT->getNumElements();
-  SmallVector<APValue, 16> ResultElements;
+  SmallVector<APValue, 64> ResultElements;
   ResultElements.reserve(NumElts);
 
   for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
+    if (IsVectorMask) {
+      ShuffleMask = static_cast<unsigned>(
+          MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
+    }
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
 
     if (SrcIdx < 0) {
@@ -13080,6 +13097,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case X86::BI__builtin_ia32_vpermi2varq128:
+  case X86::BI__builtin_ia32_vpermi2varpd128: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x1;
+                              unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2vard128:
+  case X86::BI__builtin_ia32_vpermi2varps128:
+  case X86::BI__builtin_ia32_vpermi2varq256:
+  case X86::BI__builtin_ia32_vpermi2varpd256: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x3;
+                              unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varhi128:
+  case X86::BI__builtin_ia32_vpermi2vard256:
+  case X86::BI__builtin_ia32_vpermi2varps256:
+  case X86::BI__builtin_ia32_vpermi2varq512:
+  case X86::BI__builtin_ia32_vpermi2varpd512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x7;
+                              unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi128:
+  case X86::BI__builtin_ia32_vpermi2varhi256:
+  case X86::BI__builtin_ia32_vpermi2vard512:
+  case X86::BI__builtin_ia32_vpermi2varps512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0xF;
+                              unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi256:
+  case X86::BI__builtin_ia32_vpermi2varhi512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x1F;
+                              unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x3F;
+                              unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
   }
 }
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 37ebc4f46a826..46ec12a63ef9c 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
   __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
                  __min_vector_width__(512)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#endif
+
 static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
   return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
 }
@@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
                                              (__v32bf)__A);
 }
 
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
   return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                   (__v32hi)__B);
@@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
     (__v32bf)_mm512_setzero_pbh());
 }
 
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS512
 
 #endif

diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 765cd682986b4..8fb8cd7cd0865 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
   __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
                  __min_vector_width__(128)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
   return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
 }
@@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
                                              (__v16bf)__A);
 }
 
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
   return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                   (__v8hi)__B);
 }
 
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
   return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                   (__v16hi)__B);
@@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 #endif
 #endif
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index ac75b6ccde735..aab1f2b61ab8a 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
                                          (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 18c4a44a4c76e..5fc0afa49ce4c 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 
 /* Vector permutations */
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                 (__v16si) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                 (__v8di) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)_mm512_setzero_si512());
@@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
                                          (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) {
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                 (__v8df)__B);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I,
+                            __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
-                             __m512d __B)
-{
+                             __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)(__m512d)__I);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
-                             __m512d __B)
-{
+                             __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) {
   return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                 (__v16sf) __B);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I,
+                            __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)__A);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U,
+                             __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)(__m512)__I);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I,
+                             __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)_mm512_setzero_ps());
 }
-
 #define _mm512_cvtt_roundpd_epu32(A, R) \
   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                               (__v8si)_mm256_undefined_si256(), \
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 142cc079c2c4b..25051228f3e0a 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
                                               (__v32hf)__A);
 }
 
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
+static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
 }
 
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
+static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
 }

diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h
index 964535c4c4900..84fda5c5849e8 100644
--- a/clang/lib/Headers/avx512vbmiintrin.h
+++ b/clang/lib/Headers/avx512vbmiintrin.h
@@ -19,59 +19,57 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
-{
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
                                                  (__v64qi) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
-                              __m512i __B)
-{
+                              __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
-                               __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                 (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                 (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
-                              __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
+                             __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                 (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                 (__v64qi)__W);
@@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
                                 (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                                 (__v64qi)_mm512_setzero_si512());
 }
-
-
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS
-
 #endif

diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h
index 4c50be7d9e7e5..58a48dadff863 100644
--- a/clang/lib/Headers/avx512vbmivlintrin.h
+++ b/clang/lib/Headers/avx512vbmivlintrin.h
@@ -24,117 +24,110 @@
                  __target__("avx512vbmi,avx512vl"), \
                  __min_vector_width__(256)))
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
-{
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
   return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I,
                                                  (__v16qi)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
-                           __m128i __B)
-{
+                           __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
                                                  (__v32qi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
-                              __m256i __B)
-{
+                              __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)__I);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)_mm256_setzero_si256());
 }
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                      (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                      (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
-                           __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A,
+                          __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                      (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                      (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
-                               __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                     (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
-                              __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A,
+                             __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                     (__v32qi)__W);
@@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
                                     (__v32qi)_mm256_setzero_si256());
 }
 
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256

diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 263a1079b26d5..575c0c8962662 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) {
   return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                  (__v8hi) __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
-                             __m128i __B)
-{
+                             __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
-                              __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                  (__v16hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
-                                __m256i __B)
-{
+                                __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)__I);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
-                                 __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I,
+                                __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)_mm256_setzero_si256());

diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 5b2b3f0d0bbd4..885231b030b23 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
                                               (__v16hf)__A);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
+static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
   return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                  (__v8hi)__B);
 }
 
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
+static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
   return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                  (__v16hi)__B);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
+static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutexvar_ph(__m128i __A, __m128h __B) {
   return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
 }
 
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
+static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutexvar_ph(__m256i __A, __m256h __B) {
   return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
 }

diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 92bb444aeb5b8..e5249926b934e 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++
b/clang/lib/Headers/avx512vlintrin.h
@@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                            (__v8sf)_mm256_setzero_ps());
 }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
    return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
                                                  (__v4si)__B);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
                              __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)__A);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)__I);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)_mm_setzero_si128());
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
    return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
                                                  (__v8si) __B);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
                                 __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)__A);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)__I);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)_mm256_setzero_si256());
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
    return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
                                                   (__v2df)__B);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I,
+                          __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)__A);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U,
+                           __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)(__m128d)__I);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I,
+                           __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)_mm_setzero_pd());
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
    return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
                                                   (__v4df)__B);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
                              __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)__A);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
                               __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)(__m256d)__I);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
                               __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)_mm256_setzero_pd());
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
                                                  (__v4sf)__B);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)__A);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)(__m128)__I);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)_mm_setzero_ps());
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
    return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
                                                  (__v8sf) __B);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+ _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I,
+                             __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
                                  (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
                                  (__v8sf)__A);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
                               __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
@@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8sf)(__m256)__I);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
                               __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
@@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8sf)_mm256_setzero_ps());
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
    return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
                                                  (__v2di)__B);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
                              __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)__A);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)__I);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)_mm_setzero_si128());
  }
 
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
    return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
                                                  (__v4di) __B);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
                                 __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,
@@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4di)__A);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,
@@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4di)__I);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,

diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c
b/clang/test/CodeGen/X86/avx512bw-builtins.c
index be2cd480f7558..e6e2e38bcc097 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1591,6 +1591,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i
   return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B);
 }
 
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 400, 10, 410, 20, 420, 30, 430,
+    40, 440, 50, 450, 60, 460, 70, 470,
+    80, 480, 90, 490, 100, 500, 110, 510,
+    120, 520, 130, 530, 140, 540, 150, 550));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            -1, -2, -3, -4, -5, -6, -7, -8,
+            -9, -10, -11, -12, -13, -14, -15, -16,
+            -17, -18, -19, -20, -21, -22, -23, -24,
+            -25, -26, -27, -28, -29, -30, -31, -32},
+        0xAAAAAAAA,
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    -1, 400, -3, 410, -5, 420, -7, 430,
+    -9, 440, -11, 450, -13, 460, -15, 470,
+    -17, 480, -19, 490, -21, 500, -23, 510,
+    -25, 520, -27, 530, -29, 540, -31, 550));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_maskz_permutex2var_epi16(
+        0x55555555,
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 0, 10, 0, 20, 0, 30, 0,
+    40, 0, 50, 0, 60, 0, 70, 0,
+    80, 0, 90, 0, 100, 0, 110, 0,
+    120, 0, 130, 0, 140, 0, 150, 0));
+
+TEST_CONSTEXPR(match_v64qu(
+    _mm512_permutex2var_epi8(
+        (__m512i)(__v64qu){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 127, 126, 125,
+            124, 123, 122, 121, 120, 119, 118, 117,
+            116, 115, 114, 113, 112, 111, 110, 109,
+            108, 107, 106, 105, 104, 103, 102, 101,
+            100, 99, 98, 97, 96, 95, 94, 93,
+            92, 91, 90, 89, 88, 87, 86, 85,
+            84, 83, 82, 81, 80, 79, 78, 77},
+        (__m512i)(__v64qu){
+            0, 64, 1, 65, 2, 66, 3, 67,
+            4, 68, 5, 69, 6, 70, 7, 71,
+            8, 72, 9, 73, 10, 74, 11, 75,
+            12, 76, 13, 77, 14, 78, 15, 79,
+            16, 80, 17, 81, 18, 82, 19, 83,
+            20, 84, 21, 85, 22, 86, 23, 87,
+            24, 88, 25, 89, 26, 90, 27, 91,
+            28, 92, 29, 93, 30, 94, 31, 95},
+        (__m512i)(__v64qu){
+            200, 210, 220, 230, 240, 250, 254, 253,
+            252, 251, 250, 249, 248, 247, 246, 245,
+            244, 243, 242, 241, 240, 239, 238, 237,
+            236, 235, 234, 233, 232, 231, 230, 229,
+            228, 227, 226, 225, 224, 223, 222, 221,
+            220, 219, 218, 217, 216, 215, 214, 213,
+            212, 211, 210, 209, 208, 207, 206, 205,
+            204, 203, 202, 201, 200, 199, 198, 197}),
+    0, 200, 10, 210, 20, 220, 30, 230,
+    40, 240, 50, 250, 60, 254, 70, 253,
+    80, 252, 90, 251, 100, 250, 110, 249,
+    120, 248, 127, 247, 126, 246, 125, 245,
+    124, 244, 123, 243, 122, 242, 121, 241,
+    120, 240, 119, 239, 118, 238, 117, 237,
+    116, 236, 115, 235, 114, 234, 113, 233,
+    112, 232, 111, 231, 110, 230, 109, 229));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask2_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        0x55555555,
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 32, 10, 33, 20, 34, 30, 35,
+    40, 36, 50, 37, 60, 38, 70, 39,
+    80, 40, 90, 41, 100, 42, 110, 43,
+    120, 44, 130, 45, 140, 46, 150, 47));
+
 __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mulhrs_epi16
   // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
@@ -2578,6 +2704,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) {
   return _mm512_broadcastw_epi16(__A);
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+                              (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
+                                                 3, 35, 4, 36, 5, 37, 6, 38,
+                                                 7, 39, 8, 40, 9, 41, 10, 42,
+                                                 11, 43, 12, 44, 13, 45, 14, 46},
+                              (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+                                                 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
+    1, 32, 101, 132, 2, 102, 3, 103,
+    4, 104, 5, 105, 6, 106, 7, 107,
+    8, 108, 9, 109, 10, 110, 11, 111,
+    12, 112, 13, 113, 14, 114, 15, 115));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
+                                                      -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32},
+                                   0xAAAAAAAA,
+                                   (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
+                                                      3, 35, 4, 36, 5, 37, 6, 38,
+                                                      7, 39, 8, 40, 9, 41, 10, 42,
+                                                      11, 43, 12, 44, 13, 45, 14, 46},
+                                   (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+                                                      117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
+    -1, -32, -3, 132, -5, 102, -7, 103,
+    -9, 104, -11, 105, -13, 106, -15, 107,
+    -17, 108, -19, 109, -21, 110, -23, 111,
+    -25, 112, -27, 113, -29, 114, -31, 115));
 
 __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastw_epi16
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 69599379b6b3d..8e65430bd3e84 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
 }
+
+TEST_CONSTEXPR(match_v16si(
+    _mm512_permutex2var_epi32(
+        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
+                           80, 90, 100, 110, 120, 130, 140, 150},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    0, 150, 200, 350, 10, 210, 20, 220,
+    30, 230, 40, 240, 50, 250, 60, 260));
+TEST_CONSTEXPR(match_v16si(
+    _mm512_mask_permutex2var_epi32(
+        (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8,
+                           -9, -10, -11, -12, -13, -14, -15, -16},
+        0xAAAA,
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    -1, -16, -3, 350, -5, 210, -7, 220,
+    -9, 230, -11, 240, -13, 250, -15, 260));
+TEST_CONSTEXPR(match_v16si(
+    _mm512_maskz_permutex2var_epi32(
+        0x5555,
+        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
+                           80, 90, 100, 110, 120, 130, 140, 150},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    0, 0, 200, 0, 10, 0, 20, 0,
+    30, 0, 40, 0, 50, 0, 60, 0));
+TEST_CONSTEXPR(match_m512(
+    _mm512_permutex2var_ps(
+        (__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
+                 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
+                 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
+    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
+    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_permutex2var_pd(
+        (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+        (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+        (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
+
 __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_testn_epi32_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
@@ -11753,3 +11803,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _
   // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
   _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2);
 }
+
+
+TEST_CONSTEXPR(match_m512d(
+    _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+                           (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                           (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
+                                0xAA,
+                                (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                                (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_maskz_permutex2var_pd(0x55, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0));
+
+TEST_CONSTEXPR(match_m512(
+    _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
+                                    9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
+                           (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                                              3, 19, 4, 20, 5, 21, 6, 22},
+                           (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
+                                    109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
+    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
+    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512( + _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, + -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f, + -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f)); + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 16, 101, 116, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32(0x5555, + (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 0, 101, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0)); + +TEST_CONSTEXPR(match_v8di( + _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 108, 1, 8, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8di( + _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + -1, 108, -3, -8, -5, -2, -7, -3)); diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index c3b6298a39b59..7d506db92faeb 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include 
"builtin_test_helpers.h" __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8 @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _ return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); } +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 200, 1, 201, 2, 202, 3, 203, + 4, 204, 5, 205, 6, 206, 7, 207, + 8, 208, 9, 209, 10, 210, 11, 211, + 12, 212, 13, 213, 14, 214, 15, 215, + 16, 216, 17, 217, 18, 218, 19, 219, + 20, 220, 21, 221, 22, 222, 23, 223, + 24, 224, 25, 225, 26, 226, 27, 227, + 28, 228, 29, 229, 30, 230, 31, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){ + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 10, 200, 12, 201, 14, 202, 16, 203, + 18, 204, 20, 205, 22, 206, 24, 207, + 26, 208, 28, 209, 30, 210, 32, 211, + 34, 212, 36, 213, 38, 214, 40, 215, + 42, 216, 44, 217, 46, 218, 48, 219, + 50, 220, 52, 221, 54, 222, 56, 223, + 58, 224, 60, 225, 62, 226, 64, 227, + 66, 228, 68, 229, 70, 230, 72, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 
30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 0, 1, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, + 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, + 28, 0, 29, 0, 30, 0, 31, 0)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95)); + __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index c4d5fc8fb6977..49b7a1a721195 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include "builtin_test_helpers.h" __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi8 @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, // CHECK-LABEL: test_mm_maskz_permutex2var_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108)); +TEST_CONSTEXPR(match_v16qu( + _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215}, + 0xAAAA, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108)); __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi8 @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _ // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108, + 9, 109, 10, 110, 11, 111, 12, 112, + 13, 113, 14, 114, 15, 115, 16, 116)); +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 
116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108, + 216, 109, 218, 110, 220, 111, 222, 112, + 224, 113, 226, 114, 228, 115, 230, 116)); __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 33c43977f72dc..121d5bf8d4adb 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, 0x05, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 3, 100, 6)); __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + 0xA5, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 7, 100, 15, 1, 110, 2, 120)); __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20}, + (__m128i)(__v2di){0, 5}, 0x1, + (__m128i)(__v2di){100, 200}), + 10, 5)); __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, + (__m256i)(__v4di){0, 1, 4, 5}, 0x5, + (__m256i)(__v4di){100, 110, 120, 130}), + 0, 1, 100, 5)); __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 return _mm_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); __m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4si( + 
_mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 0, 40, 0, 300)); __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 return _mm256_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 return _mm_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 1.0, 10.0)); __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + -1.0, 10.0)); __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, 
(__m128d){10.0, 20.0}), + 0.0, 10.0)); __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 return _mm256_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 1.0, 10.0, 2.0, 20.0)); __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + -1.0, 10.0, -3.0, -4.0)); __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 0.0, 10.0, 0.0, 0.0)); __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 return _mm_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 1.f, 4.f, 10.f, 30.f)); __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + -1.f, -4.f, -3.f, 30.f)); __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 0.f, 4.f, 0.f, 30.f)); __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 return _mm256_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f)); __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return 
_mm256_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f)); __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f)); __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 return _mm_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 10, 200)); __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + -1, 200)); __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 0, 200)); __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 return _mm256_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 10, 100, 110)); __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + -1, -2, 100, -4)); __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 
30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
+ 0, 0, 100, 0));
 __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
 // CHECK-LABEL: test_mm_mask_cvtepi8_epi32
 // CHECK: sext <4 x i8> %{{.*}} to <4 x i32>
@@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0));
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4));
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4));
+TEST_CONSTEXPR(match_v8si(
+ _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
+ 1, 8, 101, 108, 2, 102, 3, 103));
+TEST_CONSTEXPR(match_v8si(
+ _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8},
+ 0xAA,
+ (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
+ -1, -8, -3, 108, -5, 102, -7, 103));
 __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
 // CHECK-LABEL: test_mm_mask_mov_pd
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index febef46458ae9..172a3cb219c8a 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1887,6 +1887,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i
 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
 return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B);
 }
+
+TEST_CONSTEXPR(match_v8hi(
+ _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160,
+ 170}),
+ 0, 70, 100, 170, 10, 110, 20, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8},
+ 0xAA,
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ -1, -8, -3, 170, -5, 110, -7, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_maskz_permutex2var_epi16(0xAA,
+ (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ 0, 70, 0, 170, 0, 110, 0, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ 0x55,
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ 0, 7, 100, 15, 10, 9, 20, 10));
+TEST_CONSTEXPR(match_v16hi(
+ _mm256_permutex2var_epi16(
+ (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
+ 80, 90, 100, 110, 120, 130, 140, 150},
+ (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
+ 3, 19, 4, 20, 5, 21, 6, 22},
+ (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
+ 280, 290, 
300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_permutex2var_epi16( + (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_maskz_permutex2var_epi16( + 0x5555, + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); + __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw @@ -3596,3 +3657,22 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } + + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170, + 180, 190, 200, 210, 220, 230, 240, 250}), + 0, 100, 10, 110, 20, 120, 30, 130, + 40, 140, 50, 150, 60, 160, 70, 170)); +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}), + 0, 200, 10, 201, 20, 202, 30, 203, + 40, 204, 50, 205, 60, 206, 70, 207, + 80, 208, 90, 209, 100, 210, 110, 211, + 120, 212, 127, 213, 126, 214, 125, 215)); From 35ee3c6f72ba5aa26299d693f866385f23e4d330 Mon Sep 17 00:00:00 2001 From: Durgadoss R <durgadossr@nvidia.com> Date: Wed, 5 Nov 2025 15:38:24 +0530 Subject: [PATCH 294/313] [MLIR][NVVM] Update mbarrier Ops to use AnyTypeOf[] (2/n) (#165993) This is a follow up of PR #165558. (1/n) This patch updates the below mbarrier Ops to use AnyTypeOf[] construct: ``` * mbarrier.arrive * mbarrier.arrive.noComplete * mbarrier.test.wait * cp.async.mbarrier.arrive ``` * Updated existing tests accordingly. * Verified locally that there are no new regressions in the `integration` tests. * TODO: Two more Ops remain and will be migrated in a subsequent PR. 
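As an illustration of the unified syntax: the dedicated `*.shared` variants
(e.g. `nvvm.mbarrier.arrive.shared`) are removed, and the address space of
the pointer operand now selects the intrinsic during LLVM translation. A
sketch only (`%shm` and `%gen` are placeholder SSA values; the exact IR is
in the updated tests):

```
%t0 = nvvm.mbarrier.arrive %shm : !llvm.ptr<3> -> i64
%t1 = nvvm.mbarrier.arrive %gen : !llvm.ptr -> i64
%ok = nvvm.mbarrier.test.wait %shm, %t0 : !llvm.ptr<3>, i64 -> i1
```
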
Signed-off-by: Durgadoss R <durgadossr@nvidia.com> --- .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 3 +- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 142 +++++++----------- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 26 +--- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 82 ++++++++-- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 10 +- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 +- mlir/test/Dialect/LLVMIR/nvvm.mlir | 12 +- mlir/test/Target/LLVMIR/nvvmir.mlir | 4 +- 9 files changed, 152 insertions(+), 137 deletions(-) diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index 4e276a72897fe..6312e61f5e62a 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -891,8 +891,7 @@ CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, assert(args.size() == 1); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) .getResult(); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 038aa0a058277..2d2c801b48f4d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -440,7 +440,7 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 5f87e5c07e56e..10f0cc254ea97 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -658,8 +658,8 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Arrive Operation"; let description = [{ The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the @@ -676,36 +676,32 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, value are implementation-specific. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr}); - }]; let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; -} -def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Arrive Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr}); + auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)"; } def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count)> { let summary = "MBarrier Arrive No-Complete Operation"; let description = [{ The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation @@ -723,33 +719,29 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, captures the phase of the *mbarrier object* prior to the arrive-on operation. The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> { - let summary = "Shared MBarrier Arrive No-Complete Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; } def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, @@ -898,8 +890,9 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p } def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> { + Results<(outs I1:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I64:$state)> { let summary = "MBarrier Non-Blocking Test Wait Operation"; let description = [{ The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the @@ -946,26 +939,20 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); - }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> { - let summary = "Shared MBarrier Non-Blocking Test Wait Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";

+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];
+
 string llvmBuilder = [{
- $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state});
+ auto [id, args] = NVVM::MBarrierTestWaitOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ $res = createIntrinsicCall(builder, id, args);
 }];
- let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}

//===----------------------------------------------------------------------===//
@@ -1541,47 +1528,30 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
 The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
 all prior cp.async operations initiated by the executing thread.
 The `addr` operand specifies the address of the *mbarrier object*
- in generic address space. The `noinc` attr impacts how the
- mbarrier's state is updated.
+ in generic or shared::cta address space. When it is generic, the
+ underlying memory must fall within the shared::cta space;
+ otherwise the behavior is undefined. The `noinc` attr impacts
+ how the mbarrier's state is updated.

 [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
 }];
- let assemblyFormat = "$addr attr-dict `:` type(operands)";

 let arguments = (ins
- LLVM_AnyPointer:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
-
- string llvmBuilder = [{
- auto intId = $noinc ?
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc :
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive;
-
- createIntrinsicCall(builder, intId, {$addr});
- }];
-}
+ AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
+ DefaultValuedAttr<I1Attr, "0">:$noinc);

-def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> {
- let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared";
- let description = [{
- The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
- track all prior cp.async operations initiated by the executing thread.
- The `addr` operand specifies the address of the *mbarrier object* in
- shared memory. The `noinc` attr impacts how the mbarrier's state
- is updated.
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
- }];
 let assemblyFormat = "$addr attr-dict `:` type(operands)";

- let arguments = (ins
- LLVM_PointerShared:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];

 string llvmBuilder = [{
- auto intId = $noinc ? 
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared; - - createIntrinsicCall(builder, intId, {$addr}); + auto [id, args] = NVVM::CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index ec182f1db48ac..9348d3c172a07 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -865,13 +865,7 @@ struct NVGPUMBarrierArriveLowering adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType, - barrier); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, - barrier); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, barrier); return success(); } }; @@ -892,13 +886,8 @@ struct NVGPUMBarrierArriveNoCompleteLowering Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>( - op, tokenType, barrier, count); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( - op, tokenType, barrier, count); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( + op, tokenType, barrier, count); return success(); } }; @@ -915,13 +904,8 @@ struct NVGPUMBarrierTestWaitLowering getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>( - op, retType, barrier, adaptor.getToken()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>( - op, retType, barrier, adaptor.getToken()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(op, retType, barrier, + adaptor.getToken()); return success(); } }; diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 262d9b753a2d7..d43f8815be16d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1752,15 +1752,21 @@ std::string NVVM::MBarrierInitOp::getPtx() { // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) { + auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(ptr.getType()); + return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS); +} + +static bool isPtrInSharedCTASpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInitOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) - ? 
llvm::Intrinsic::nvvm_mbarrier_init_shared - : llvm::Intrinsic::nvvm_mbarrier_init; + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; // Fill the Intrinsic Args llvm::SmallVector<llvm::Value *> args; @@ -1773,16 +1779,72 @@ mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInvalOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_inval_shared : llvm::Intrinsic::nvvm_mbarrier_inval; return {id, {mt.lookupValue(thisOp.getAddr())}}; } +mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + +mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = + isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_test_wait_shared + : llvm::Intrinsic::nvvm_mbarrier_test_wait; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getState())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::CpAsyncMBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + + llvm::Intrinsic::ID id; + if (thisOp.getNoinc()) { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc; + } else { + id = isShared ? 
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + } + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 8cce6308018e2..dcf4ddb2dd48c 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -491,12 +491,12 @@ func.func @mbarrier() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive %[[barPtr2]] %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -521,12 +521,12 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete %[[barPtr2]] %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -572,7 +572,7 @@ func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.ad // CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 // CHECK: %[[S4:.+]] = llvm.extractvalue %[[CARG0]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 -// CHECK: nvvm.mbarrier.test.wait.shared {{.*}}, %[[CARG1]] +// CHECK: nvvm.mbarrier.test.wait {{.*}}, %[[CARG1]] %mbarId = arith.remui %i, %numBarriers : index %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index fbc4c0af60360..a9356c5cb60bb 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -88,10 +88,10 @@ 
func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 2505e56407c2b..cd7bd37da5763 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -445,8 +445,8 @@ llvm.func private @mbarrier_arrive(%barrier: !llvm.ptr) { } llvm.func private @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.arrive.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.shared %barrier : !llvm.ptr<3> -> i64 + // CHECK: nvvm.mbarrier.arrive %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 llvm.return } @@ -459,8 +459,8 @@ llvm.func private @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { llvm.func private @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.arrive.nocomplete.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.nocomplete.shared %barrier, %count : !llvm.ptr<3>, i32 -> i64 + // CHECK: nvvm.mbarrier.arrive.nocomplete %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 llvm.return } @@ -472,8 +472,8 @@ llvm.func private @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 { llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.test.wait.shared %{{.*}} - %isComplete = nvvm.mbarrier.test.wait.shared %barrier, %token : !llvm.ptr<3>, i64 -> i1 + // CHECK: nvvm.mbarrier.test.wait %{{.*}} + %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1 llvm.return } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 9115de65ff0e8..3fc09f371a347 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -538,9 +538,9 @@ llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}}) nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } From 833983918d2f401886cee74174850b987eaf80b5 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr <JanPatrick.Lehr@amd.com> Date: Wed, 5 Nov 2025 11:11:08 +0100 
Subject: [PATCH 295/313] Revert "CodeGen: Record MMOs in finalizeBundle" (#166520) Reverts llvm/llvm-project#166210 Buildbot failures in the libc on GPU bot: https://lab.llvm.org/buildbot/#/builders/10/builds/16711 --- llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2 - llvm/lib/CodeGen/MachineInstrBundle.cpp | 6 - .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/store-local.128.ll | 20 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 7 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 9954 +++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 363 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1503 +-- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 295 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 357 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 370 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 534 +- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 839 +- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 756 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 1361 +-- llvm/test/CodeGen/AMDGPU/bf16.ll | 28 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 49 +- .../CodeGen/AMDGPU/call-argument-types.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 4 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 63 +- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 52 - .../AMDGPU/gfx-callable-return-types.ll | 153 +- llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 21 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 156 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 18 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 10 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214 +- ...er-buffer-fat-pointers-lastuse-metadata.ll | 31 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 58 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 4 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 8 +- llvm/test/CodeGen/AMDGPU/max.ll | 2 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 1971 ++-- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 6 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 203 +- .../CodeGen/AMDGPU/postra-bundle-memops.mir | 5 +- .../postra-bundle-vimage-vsample-gfx12.mir | 4 +- .../AMDGPU/promote-constOffset-to-imm.ll | 41 +- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 2380 ++-- .../soft-clause-exceeds-register-budget.ll | 15 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 6 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 22 +- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +- .../Thumb2/mve-vpt-block-fold-vcmp.mir | 45 +- 53 files changed, 11293 insertions(+), 10710 deletions(-) diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 434a579c3be3f..4795d81e3f348 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,8 +1161,6 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; - if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) - break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git 
a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index a8dc614288f20..88d81993fbe55 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -137,7 +137,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; SmallVector<std::pair<Register, Register>> TiedOperands; - SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -201,9 +200,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); - - if (MII->mayLoadOrStore()) - MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -229,8 +225,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(UseIdx < ExternUses.size()); MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } - - MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 6076a2eec44bc..c2129c20e4543 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -33,6 +33,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -50,7 +51,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 10e83b70a57d4..1812e17800e71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,11 +189,15 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -204,22 +208,18 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 -; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 4a22a911c60b7..b33b8a7d8cd72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,6 +272,10 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -284,9 +288,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 746ffcff5667a..74552a500ac51 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,6 +3105,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3237,22 +3253,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,13 +3284,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3522,6 +3523,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; 
%cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -3944,24 +3946,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -4309,12 +4295,44 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -4419,22 +4437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: 
$vgpr39 @@ -4540,129 +4542,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5111,10 +5113,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5254,8 +5255,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -5272,23 +5280,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5305,6 +5302,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,6 +5437,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5492,7 +5493,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5507,7 +5508,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5519,147 +5520,149 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5667,9 +5670,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5697,7 +5698,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6005,25 +6006,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -6295,6 +6280,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6754,11 +6755,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 
offset:80 @@ -6779,6 +6776,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7415,7 +7416,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10665,7 +10666,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11598,7 +11599,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11811,26 +11812,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11991,30 +11979,44 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -12023,11 +12025,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12630,6 +12632,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -12643,8 +12646,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13324,25 +13327,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13479,20 +13470,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13978,6 +13983,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14555,27 +14561,13 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14717,20 +14709,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 
offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15217,6 +15223,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16355,7 +16362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16388,7 +16395,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17329,7 +17336,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17362,7 +17369,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; 
GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18079,13 +18086,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18096,22 +18114,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18121,6 +18127,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18715,13 +18722,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18949,11 +18956,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18963,8 +18970,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18972,8 +18982,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19182,6 +19190,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19199,12 +19213,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19214,7 +19222,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19227,6 +19235,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -19811,8 +19820,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -19991,18 +20000,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20029,8 +20036,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20046,16 +20054,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20067,11 +20073,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20084,12 +20089,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20103,22 +20106,17 @@ define inreg <32 
x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20134,24 +20132,45 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa 
v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20202,6 +20221,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20215,18 +20246,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20664,7 +20683,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20697,7 +20716,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21554,7 +21573,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21587,7 +21606,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> 
inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21605,7 +21624,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21638,7 +21657,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22495,7 +22514,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22528,7 +22547,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23091,25 +23110,10 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -23288,6 +23292,22 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26109,10 +26129,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26129,6 +26146,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x 
bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -26694,7 +26714,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0xf
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -29161,7 +29181,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -29194,7 +29214,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -29227,7 +29247,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x6
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -30029,7 +30049,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -30062,7 +30082,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -30095,7 +30115,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x6
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -30135,7 +30155,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -30168,7 +30188,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -30201,7 +30221,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x8
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -30893,7 +30913,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -30926,7 +30946,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -30959,7 +30979,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x8
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -31768,22 +31788,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -31803,6 +31807,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -32473,25 +32493,10 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -32504,6 +32509,21 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -34712,7 +34732,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -34745,7 +34765,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -34778,7 +34798,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -34856,7 +34876,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v185, off, s32
 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -34889,7 +34909,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -34922,7 +34942,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -34980,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32i32_to_v64i16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -35000,6 +35016,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr60
 ; SI-NEXT: ; implicit-def: $vgpr58
 ; SI-NEXT: ; implicit-def: $vgpr63
@@ -35031,13 +35051,14 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr49
 ; SI-NEXT: ; kill: killed $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT: ; implicit-def: $vgpr33
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT: s_cbranch_execz .LBB24_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -35082,6 +35103,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: s_cbranch_execz .LBB24_4
 ; SI-NEXT: ; %bb.3: ; %cmp.true
 ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
@@ -35308,22 +35330,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -35350,7 +35356,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -35363,6 +35369,22 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -36316,13 +36338,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -36354,6 +36370,12 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -36369,6 +36391,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -36585,6 +36608,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -37758,7 +37782,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -37791,7 +37815,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -37824,7 +37848,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -37902,7 +37926,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v185, off, s32
 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -37935,7 +37959,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -37968,7 +37992,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -40009,6 +40033,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32f32_to_v128i8:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -40141,22 +40181,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr36
 ; SI-NEXT: ; kill: killed $vgpr36
 ; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr45
 ; SI-NEXT: ; implicit-def: $vgpr43
 ; SI-NEXT: ; implicit-def: $vgpr41
@@ -40188,13 +40212,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; kill: killed $vgpr36
 ; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT: ; implicit-def: $vgpr33
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT: s_cbranch_execz .LBB36_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -40426,6 +40451,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: s_cbranch_execz .LBB36_4
 ; SI-NEXT: ; %bb.3: ; %cmp.true
 ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -40848,24 +40874,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -41213,12 +41223,44 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v32f32_to_v128i8:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -41323,22 +41365,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: ; implicit-def: $vgpr39
 ; VI-NEXT: ; kill: killed $vgpr39
 ; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: ; implicit-def: $vgpr59
 ; VI-NEXT: ; kill: killed $vgpr39
 ; VI-NEXT: ; implicit-def: $vgpr39
@@ -41444,129 +41470,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -42015,10 +42041,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -42158,8 +42183,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -42176,23 +42208,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v32f32_to_v128i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -42209,6 +42230,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT: ; implicit-def: $vgpr40
 ; GFX9-NEXT: ; kill: killed $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -42341,6 +42365,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: ; kill: killed $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr41
 ; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -42396,7 +42421,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(45)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT: ; implicit-def: $vgpr33
 ; GFX9-NEXT: ; kill: killed $vgpr33
@@ -42411,7 +42436,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(47)
+; GFX9-NEXT: s_waitcnt vmcnt(31)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -42423,147 +42448,149 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded
Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42571,9 +42598,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42601,7 +42626,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -42909,25 +42934,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -43199,6 +43208,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -43641,11 +43666,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43666,6 +43687,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44285,7 +44310,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off 
offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44745,11 +44770,27 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44801,33 +44842,24 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; 
SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44836,8 +44868,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44845,7 +44875,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44856,8 +44885,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45381,33 +45408,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 -; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s58 +; SI-NEXT: v_mov_b32_e32 v27, s62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt 
expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45441,7 +45468,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45668,22 +45694,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload @@ -45796,16 +45809,17 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46056,6 +46070,19 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -46660,10 +46687,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46741,9 +46764,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46821,9 +46841,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46842,6 +46859,40 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, 
s18 @@ -46895,35 +46946,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -46991,20 +47018,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -47037,7 +47050,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 @@ -47328,6 +47341,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload @@ -48096,8 +48123,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48146,7 +48175,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; 
GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48194,7 +48222,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48264,20 +48291,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -48314,7 +48327,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 @@ -48608,6 +48621,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload @@ -48619,7 +48646,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48654,7 +48681,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49574,7 +49601,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49636,7 +49663,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49849,26 +49876,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; 
SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50029,30 +50043,44 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50061,11 +50089,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50668,6 +50696,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50681,8 +50710,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51362,25 +51391,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], 
s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51517,20 +51534,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52016,6 +52047,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52593,27 +52625,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52755,20 +52773,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 
4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53255,6 +53287,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54393,7 +54426,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54426,7 +54459,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55367,7 +55400,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55400,7 +55433,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56117,13 +56150,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56134,22 +56178,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 
; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56159,6 +56191,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56753,13 +56786,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -56987,11 +57020,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: 
s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57001,8 +57034,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57010,8 +57046,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57220,6 +57254,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57237,12 +57277,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: 
v_mov_b32_e32 v49, v51 @@ -57252,7 +57286,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57265,6 +57299,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57849,8 +57884,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58029,18 +58064,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58067,8 +58100,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58084,16 +58118,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58105,11 +58137,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58122,12 +58153,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58141,22 +58170,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58172,24 +58196,45 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58240,6 +58285,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, 
s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58253,18 +58310,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58702,7 +58747,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58735,7 +58780,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59592,7 +59637,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59625,7 +59670,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59643,7 +59688,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -59676,7 +59721,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -60533,7 +60578,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -60566,7 +60611,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: 
scratch_load_b32 v45, off, s32 offset:456 @@ -61129,25 +61174,10 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -61326,6 +61356,22 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62094,20 +62140,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -62144,9 +62176,10 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -62160,6 +62193,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -64192,10 +64239,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -64212,6 +64256,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -64777,7 +64824,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -67244,7 +67291,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67277,7 +67324,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67310,7 +67357,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68112,7 +68159,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68145,7 +68192,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68178,7 +68225,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68218,7 +68265,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68251,7 +68298,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68284,7 +68331,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, 
s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -68976,7 +69023,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69009,7 +69056,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69042,7 +69089,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -69851,22 +69898,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -69886,6 +69917,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -70556,6 +70603,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -70572,11 +70624,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -72766,7 +72813,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded 
Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72799,7 +72846,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72832,7 +72879,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72910,7 +72957,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -72943,7 +72990,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -72976,7 +73023,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73034,10 +73081,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73054,6 +73097,10 @@ define 
<64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73085,13 +73132,14 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73136,6 +73184,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73362,22 +73411,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -73404,7 +73437,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -73417,6 +73450,22 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -73902,25 +73951,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -73946,6 +73979,22 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -74324,13 +74373,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74362,6 +74405,12 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74377,6 +74426,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74593,6 +74643,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75766,7 +75817,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -75799,7 +75850,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75832,7 +75883,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75910,7 +75961,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75943,7 +75994,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x 
i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -75976,7 +76027,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77003,6 +77054,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77135,22 +77202,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77182,13 +77233,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77449,6 +77501,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -77842,24 +77895,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte 
Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload @@ -78207,12 +78244,44 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78317,22 +78386,6 @@ define 
<128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78438,129 +78491,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79009,10 +79062,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79152,8 +79204,15 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, 
v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -79170,23 +79229,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79203,6 +79251,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79335,6 +79386,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79390,7 +79442,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: s_waitcnt 
vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79405,7 +79457,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79417,147 +79469,149 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, 
v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79565,9 +79619,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79624,7 +79676,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -79903,25 +79955,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -80193,6 +80229,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -80660,11 +80712,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80685,6 +80733,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81329,7 +81381,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84579,7 +84631,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85514,7 +85566,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, 
v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85727,26 +85779,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85907,30 +85946,44 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85939,11 +85992,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86546,6 +86599,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86559,8 +86613,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87240,25 +87294,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87395,20 +87437,34 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87894,6 +87950,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88471,27 +88528,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88633,20 +88676,34 @@ define <16 x i64> 
@bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89133,6 +89190,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90271,7 +90329,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 
0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90304,7 +90362,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91245,7 +91303,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91278,7 +91336,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -91995,13 +92053,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92012,22 +92081,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92037,6 +92094,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92631,13 +92689,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92865,11 +92923,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, 
v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92879,8 +92937,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92888,8 +92949,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93098,6 +93157,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93115,12 +93180,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte 
Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93130,7 +93189,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93143,6 +93202,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -93727,8 +93787,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93907,18 +93967,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, 
v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -93945,8 +94003,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -93962,16 +94021,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -93983,11 +94040,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94000,12 +94056,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94019,22 +94073,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94050,24 +94099,45 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94118,6 +94188,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94131,18 +94213,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94580,7 +94650,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: 
bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94613,7 +94683,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95470,7 +95540,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95503,7 +95573,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95521,7 +95591,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95554,7 +95624,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96411,7 +96481,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; 
GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96444,7 +96514,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97008,25 +97078,10 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -97205,6 +97260,22 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -100013,10 +100084,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -100033,6 +100101,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -100598,7 +100669,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -103065,7 +103136,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103098,7 +103169,7 @@ define inreg <16 x i64> 
@bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103131,7 +103202,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -103933,7 +104004,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -103966,7 +104037,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -103999,7 +104070,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104039,7 +104110,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104072,7 +104143,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104105,7 +104176,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104797,7 +104868,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104830,7 +104901,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104863,7 +104934,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -105669,22 +105740,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -105704,6 +105759,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -106382,25 +106453,10 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -106413,6 +106469,21 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -108629,7 +108700,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108662,7 +108733,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: 
scratch_store_b32 off, v106, s32 offset:156 @@ -108695,7 +108766,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108773,7 +108844,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108806,7 +108877,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108839,7 +108910,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108897,10 +108968,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108917,6 +108984,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -108948,13 +109019,14 @@ define <64 x i16> 
@bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109027,6 +109099,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109223,22 +109296,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -109265,7 +109322,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -109278,6 +109335,22 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -110247,13 +110320,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110285,6 +110352,12 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -110300,6 +110373,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; 
SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110516,6 +110590,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111689,7 +111764,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111722,7 +111797,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111755,7 +111830,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111833,7 +111908,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111866,7 +111941,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111899,7 +111974,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 
offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -111957,6 +112032,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112089,22 +112180,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112136,13 +112211,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112373,6 +112449,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -112750,24 +112827,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload @@ -113145,12 +113206,44 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113253,22 +113346,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113371,132 +113448,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 
; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -113932,10 +114009,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -114075,7 +114151,17 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -114092,26 +114178,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114128,6 +114200,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114260,6 +114335,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114319,7 +114395,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114332,7 +114408,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(49) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114340,151 +114416,152 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114494,7 +114571,6 @@ define <128 x i8> 
@bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114523,7 +114599,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -114828,24 +114904,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115110,6 +115170,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -115552,11 +115628,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115577,6 +115649,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116196,7 +116272,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -116980,11 +117056,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: 
v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117077,10 +117148,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117137,69 +117204,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: 
v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117242,108 +117260,165 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 2 -; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 4 -; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 3 +; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 6 -; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 5 +; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 8 -; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 7 +; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: 
v_readlane_b32 s5, v61, 11 +; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: 
v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -117636,9 +117711,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -117927,6 +118002,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -117940,15 +118024,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -118615,10 +118690,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118656,9 +118727,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118696,9 +118764,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118714,6 +118779,52 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118723,7 +118834,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; 
VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118736,17 +118846,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118776,48 +118886,13 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, 
s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s46 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119216,7 +119291,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -119231,9 +119309,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -119831,12 +119906,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: -; GFX9-NEXT: v_mov_b32_e32 v41, s66 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119913,10 +119982,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: 
v_readlane_b32 s4, v62, 10 @@ -119975,10 +120040,71 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -119993,10 +120119,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120021,6 +120143,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120058,54 +120181,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120127,8 +120202,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120179,45 +120252,46 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, 
v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120231,11 +120305,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 @@ -120247,23 +120319,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 @@ -120285,7 +120343,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120513,6 +120574,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -120524,7 +120599,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120559,7 +120634,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-NEXT: s_clause 0x13 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121467,7 +121542,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-NEXT: s_clause 0x13 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121530,7 +121605,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121743,26 +121818,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121923,30 +121985,44 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -121955,11 +122031,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122562,6 +122638,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122575,8 +122652,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123256,25 +123333,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; 
VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123411,20 +123476,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123910,6 
+123989,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -124487,27 +124567,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124649,20 +124715,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125149,6 +125229,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126287,7 +126368,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126320,7 +126401,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127261,7 +127342,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, 
s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127294,7 +127375,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128011,13 +128092,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128028,22 +128120,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128053,6 +128133,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128647,13 +128728,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128881,11 +128962,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128895,8 +128976,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: 
v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128904,8 +128988,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129114,6 +129196,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129131,12 +129219,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129146,7 +129228,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129159,6 +129241,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129743,8 +129826,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -129923,18 +130006,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -129961,8 +130042,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt 
vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -129978,16 +130060,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -129999,11 +130079,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130016,12 +130095,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130035,22 +130112,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130066,24 +130138,45 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130134,6 +130227,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -130147,18 +130252,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130596,7 +130689,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130629,7 +130722,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131486,7 +131579,7 @@ 
define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131519,7 +131612,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131537,7 +131630,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131570,7 +131663,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132427,7 +132520,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132460,7 +132553,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132495,6 +132588,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132563,22 +132672,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132610,7 +132703,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132620,7 +132713,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132750,6 +132843,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -132987,25 +133081,10 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -133184,6 +133263,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -133871,22 +133966,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -133974,6 +134055,20 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -135976,10 +136071,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -135996,6 +136088,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136561,7 +136656,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -139028,7 +139123,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139061,7 +139156,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139094,7 +139189,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139896,7 +139991,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -139929,7 +140024,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -139962,7 +140057,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140002,7 +140097,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140035,7 +140130,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140068,7 +140163,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140760,7 +140855,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140793,7 +140888,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140826,7 +140921,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140883,6 +140978,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -140951,22 +141062,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -140998,7 +141093,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141049,6 +141144,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141218,6 +141314,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: 
s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -141565,24 +141662,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -141631,6 +141712,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -142275,22 +142372,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -142310,6 +142391,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB81_4: @@ -144470,7 +144567,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: 
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144503,7 +144600,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144536,7 +144633,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144614,7 +144711,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144647,7 +144744,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144680,7 +144777,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144738,10 +144835,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144758,6 +144851,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144789,13 +144886,14 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144839,6 +144937,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145050,22 +145149,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v27 @@ -145092,7 +145175,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -145105,6 +145188,22 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -145508,23 +145607,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 @@ -145585,6 +145668,22 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: @@ -145932,13 +146031,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -145970,6 +146063,12 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 
SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -145985,6 +146084,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146201,6 +146301,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147374,7 +147475,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147407,7 +147508,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147440,7 +147541,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147518,7 +147619,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147551,7 +147652,7 @@ define inreg <16 x 
double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147584,7 +147685,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147794,8 +147895,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147805,7 +147904,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147845,39 +147944,38 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147893,12 +147991,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -147920,6 +148017,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -147927,15 +148032,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt 
vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -147944,11 +148045,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -147958,7 +148057,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149458,25 +149557,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen @@ -149773,6 +149857,22 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -149840,8 +149940,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -149937,25 +150037,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150083,19 +150171,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150103,6 +150186,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150111,57 +150214,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: 
$vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150194,19 +150275,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: 
$vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150372,9 +150473,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151059,8 +151168,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151171,27 +151280,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; 
GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151324,19 +151419,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151344,6 +151434,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151352,62 +151462,36 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151430,25 +151514,49 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, 
off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151614,9 +151722,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -152962,7 +153078,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -152995,7 +153111,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> 
%a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153824,7 +153940,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153857,7 +153973,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -153902,6 +154018,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -153912,7 +154029,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: v_writelane_b32 v43, s18, 1 ; SI-NEXT: v_writelane_b32 v43, s17, 2 @@ -153953,16 +154070,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_readfirstlane_b32 s38, v20 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s39, v19 @@ 
-153989,7 +154100,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -153997,31 +154110,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154104,41 +154219,44 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s24, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s78, v34 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 22 
-; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154152,7 +154270,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -154161,6 +154279,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_writelane_b32 v43, s22, 34 @@ -155775,53 +155894,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155866,6 +155965,52 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: 
buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -155885,7 +156030,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -155894,6 +156038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -155925,25 +156070,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, 
s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -155968,18 +156094,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156029,11 +156152,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156041,37 +156163,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156086,12 +156221,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156113,28 +156249,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword 
v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156152,10 +156281,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156188,6 +156318,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156208,7 +156339,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156800,51 +156930,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, 
s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -156908,42 +157016,82 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -156964,13 +157112,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157224,13 +157365,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157240,6 +157382,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157251,7 +157394,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157717,7 +157859,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158447,7 +158589,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158489,7 +158631,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159273,7 +159415,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, 
off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161295,23 +161437,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload @@ -161358,15 +161484,28 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload 
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -161383,6 +161522,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161571,165 +161713,166 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: v_mov_b32_e32 v45, v46 -; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 -; VI-NEXT: 
v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, v46 +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161742,7 +161885,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162376,27 +162518,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -162781,10 +162923,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -162879,6 +163020,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -162895,16 +163046,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -163141,11 +163282,49 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163159,7 +163338,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163177,168 +163355,130 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, 
v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163431,11 +163571,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163590,7 +163735,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(50) +; GFX9-NEXT: s_waitcnt vmcnt(52) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163605,6 +163750,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163745,10 +163891,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163814,7 +163958,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163822,7 +163965,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163851,14 +163994,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -163878,19 +164031,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -163899,24 +164045,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -163924,8 +164052,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; 
GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -163969,6 +164101,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164001,51 +164134,74 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164058,26 +164214,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 
8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164086,9 +164231,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164113,33 +164255,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshrrev_b64 
v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164154,10 +164294,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164166,8 +164302,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164176,29 +164310,38 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164373,13 +164516,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 @@ -164390,13 +164533,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 @@ -164536,11 +164679,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164573,7 +164712,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1b ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164602,6 +164741,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165635,7 +165778,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165668,7 +165811,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload 
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165703,11 +165846,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x15 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165730,6 +165869,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166848,7 +166991,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x15 ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -168520,26 +168663,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s86, 24 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -168578,6 +168708,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -169780,15 +169924,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 ; VI-NEXT: v_readlane_b32 s4, v62, 14 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 15 @@ -169814,11 +169949,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 22 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 ; VI-NEXT: v_readlane_b32 s4, v62, 23 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169864,8 +169994,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169924,47 +170052,45 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: v_mov_b32_e32 v42, s54 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: v_mov_b32_e32 v41, s46 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s58 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v41, s58 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s66 ; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v41, s60 ; VI-NEXT: v_mov_b32_e32 v55, v50 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v59, s87 ; VI-NEXT: v_mov_b32_e32 v58, s34 ; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v34, s38 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 ; VI-NEXT: v_mov_b32_e32 v3, s42 @@ -169997,19 +170123,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v30, s29 ; VI-NEXT: v_mov_b32_e32 v32, s5 ; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 ; VI-NEXT: v_mov_b32_e32 v57, s81 ; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v60, s52 ; VI-NEXT: v_mov_b32_e32 v38, s51 ; VI-NEXT: v_mov_b32_e32 v61, s65 ; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v45, s53 ; VI-NEXT: v_mov_b32_e32 v39, s55 +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v51, v53 
+; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 +; VI-NEXT: v_mov_b32_e32 v58, s50 +; VI-NEXT: v_mov_b32_e32 v45, s53 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end @@ -170318,10 +170462,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170399,23 +170542,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170433,6 +170562,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -172021,7 +172164,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -172036,10 +172183,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -172051,7 +172194,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-TRUE16-NEXT: 
s_clause 0x3 ; 16-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -173601,7 +173744,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -173614,7 +173757,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -175171,7 +175314,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -175345,6 +175488,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 @@ -175362,9 +175508,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -175382,15 +175525,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 @@ -175526,37 +175669,34 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 @@ -175576,10 +175716,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 @@ -175589,9 +175726,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 @@ -175617,6 +175752,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -175739,6 +175882,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -176396,18 +176540,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -177114,24 +177265,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -177438,6 +177574,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177505,8 +177657,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -177602,25 +177754,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -177748,19 +177888,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177768,6 +177903,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -177776,57 +177931,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177859,19 +177992,39 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178037,9 +178190,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -178724,8 +178885,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x 
i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -178836,27 +178997,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -178989,19 +179136,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -179009,6 +179151,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -179017,62 +179179,36 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, 
v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179095,25 +179231,49 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; 
GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179279,9 +179439,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload 
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -180627,7 +180795,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -180660,7 +180828,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -181489,7 +181657,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -181522,7 +181690,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -183064,6 +183232,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload @@ -183077,17 +183256,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload @@ -183347,53 +183515,33 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, 
s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -183438,6 +183586,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 
offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -183457,7 +183651,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -183466,6 +183659,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183497,25 +183691,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183540,18 +183715,15 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183601,11 +183773,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183613,37 +183784,50 @@ define inreg 
<64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183658,12 +183842,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183685,28 +183870,21 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183724,10 +183902,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183760,6 +183939,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183780,7 +183960,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -184372,51 +184551,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184480,42 +184637,82 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 
offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184536,13 +184733,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184796,13 +184986,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, 
s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -184812,6 +185003,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -184823,7 +185015,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185289,7 +185480,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186019,7 +186210,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186061,7 +186252,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186845,7 +187036,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -188862,6 +189053,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, 
i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -188878,13 +189076,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -188907,42 +189098,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: 
v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -188954,38 +189130,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -188994,6 +189179,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; 
implicit-def: $vgpr0 @@ -189037,34 +189254,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189102,49 +189293,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 ; VI-NEXT: v_mov_b32_e32 v15, v3 -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v53 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189156,20 +189326,83 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189184,94 +189417,61 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: 
v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 ; VI-NEXT: v_mov_b32_e32 v40, v16 ; VI-NEXT: v_mov_b32_e32 v16, v48 ; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189290,47 +189490,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, 
v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189338,21 +189527,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: 
v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189361,34 +189542,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 ; VI-NEXT: v_or_b32_e32 v49, v9, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189406,11 +189591,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189427,32 +189612,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189460,13 +189641,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, 
v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189475,6 +189679,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189490,6 +189695,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189507,21 +189714,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: v_mov_b32_e32 v31, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189529,14 +189736,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 
8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189544,39 +189760,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189593,24 +189797,26 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; 
VI-NEXT: v_bfe_u32 v58, v60, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189660,25 +189866,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189690,13 +189898,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 @@ -189749,9 +189953,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189760,11 +189962,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 @@ -189786,8 +189991,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -189795,9 +190003,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189847,9 +190053,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 @@ -189863,9 +190067,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189881,35 +190088,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -189928,10 +190135,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt 
vmcnt(0) @@ -189951,10 +190161,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -189964,7 +190176,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -189981,20 +190201,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190057,23 +190285,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190104,6 +190315,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190137,7 +190349,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190260,100 +190472,101 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 
16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190369,7 +190582,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190395,7 +190607,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -190936,7 +191148,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -190953,18 +191175,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -191423,11 +191633,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191448,6 +191654,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192083,7 +192293,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, 
off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -193730,27 +193940,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s51, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 @@ -193783,6 +193979,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s45, v62, 17 ; SI-NEXT: v_readlane_b32 s43, v62, 23 ; SI-NEXT: v_readlane_b32 
s41, v62, 29 @@ -193790,7 +194001,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s27, v62, 41 ; SI-NEXT: v_readlane_b32 s25, v62, 45 ; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -194273,6 +194483,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194280,7 +194492,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194288,6 +194499,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194295,20 +194512,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 -; 
VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194343,7 +194554,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -194953,11 +195163,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -194966,10 +195174,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195101,22 +195312,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195184,6 +195382,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -195501,42 +195713,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196188,11 +196400,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -196224,7 +196434,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -196475,6 +196688,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -196489,8 +196704,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -196502,7 +196715,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196537,7 +196750,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197456,7 +197669,7 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197518,7 +197731,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197569,11 +197782,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197663,30 +197876,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197694,21 +197884,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197716,211 +197913,240 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
vmcnt(3) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword 
v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -197932,19 +198158,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -197980,7 +198202,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198460,15 +198682,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -199696,8 +199918,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -199714,12 +199942,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -199787,8 +200009,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -199884,25 +200106,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 
offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200030,19 +200240,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200050,6 +200255,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: 
buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200058,57 +200283,35 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200141,19 +200344,39 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200319,9 +200542,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201006,8 +201237,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201118,27 +201349,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201271,19 +201488,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201291,6 +201503,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: 
buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201299,62 +201531,36 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201377,25 +201583,49 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201561,9 +201791,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; 
implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -202909,7 +203147,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -202942,7 +203180,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -203771,7 +204009,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -203804,7 +204042,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -203849,6 +204087,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -203858,9 +204097,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; 
SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -203909,12 +204148,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -203937,7 +204170,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -203946,28 +204181,30 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s44, v36 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -203981,10 +204218,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204061,35 +204294,39 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: 
v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204103,7 +204340,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204130,6 +204367,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205698,53 +205938,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, 
off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -205789,6 +206009,52 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -205808,7 +206074,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -205817,6 +206082,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; 
VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -205848,25 +206114,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -205891,18 +206138,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ 
-205952,11 +206196,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -205964,37 +206207,50 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206009,12 +206265,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206036,28 +206293,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206075,10 +206325,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206111,6 +206362,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:548 ; 4-byte Folded Reload @@ -206131,7 +206383,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206723,51 +206974,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: 
s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -206831,42 +207060,82 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -206887,13 +207156,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: 
s_and_b32 s4, s28, 0xff @@ -207147,13 +207409,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207163,6 +207426,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207174,7 +207438,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207640,7 +207903,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -208370,7 +208633,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 
@@ -208412,7 +208675,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209196,7 +209459,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209299,6 +209562,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209428,29 +209785,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; 
SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209467,6 +209809,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209521,39 +209870,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209563,81 +209885,36 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209659,18 +209936,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211242,25 +211507,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -211284,12 +211533,44 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211307,22 +211588,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 
offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211619,12 +211884,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211656,6 +211923,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211726,16 +211997,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -211936,6 +212201,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword 
v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -211943,35 +212211,38 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] @@ -211984,6 +212255,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 @@ -212052,7 +212325,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v49, v53 ; VI-NEXT: v_mov_b32_e32 v53, v38 ; VI-NEXT: v_mov_b32_e32 v38, v55 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill @@ -212064,13 +212336,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -212478,24 +212743,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: 
s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -212518,12 +212768,44 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64i16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], 
s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -212586,23 +212868,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -212633,6 +212898,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -212666,7 +212932,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212789,100 +213055,101 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -212898,7 +213165,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -212923,7 +213189,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -213464,7 +213730,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -213481,18 +213757,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -213951,11 +214215,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -213976,6 +214236,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -214611,7 +214875,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -214750,18 +215014,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s91, v32 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s93, v33 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s55, v34 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s17, v35 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s95, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s35, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -214771,6 +215023,18 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s55, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s17, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s95, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s35, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s83, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; SI-NEXT: v_readfirstlane_b32 s65, v7 @@ -214782,34 +215046,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s39, v1 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s77, v31 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s38, v32 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s48, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s50, v39 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s76, v48 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s99, v34 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s90, v35 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s92, v36 ; SI-NEXT: v_writelane_b32 v41, s90, 11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s94, v37 ; SI-NEXT: v_writelane_b32 v41, s92, 12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s30, v49 ; SI-NEXT: v_writelane_b32 v41, s94, 13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s34, v50 ; SI-NEXT: v_writelane_b32 v41, s30, 14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: v_writelane_b32 v41, s34, 15 ; SI-NEXT: v_writelane_b32 v41, s36, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: v_writelane_b32 v41, s38, 17 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s99, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_writelane_b32 v41, s48, 18 ; SI-NEXT: v_writelane_b32 v41, s50, 19 @@ -217791,48 +218060,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: 
v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 ; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -218484,11 +218753,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -218520,7 +218787,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -218771,6 +219041,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -218785,8 +219057,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -218798,7 +219068,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -218833,7 +219103,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -219752,7 +220022,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -219814,7 +220084,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -220832,24 +221102,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], 
s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -221194,14 +221449,28 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -221218,7 +221487,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -221467,6 +221738,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 
@@ -221832,9 +222104,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -221851,7 +222120,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -222070,7 +222341,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -222370,7 +222641,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -222930,7 +223201,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -224659,26 +224930,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -224711,6 +224966,22 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -228691,6 +228962,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -228707,7 +228979,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -230050,7 +230321,20 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -230067,19 +230351,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -231127,32 +231398,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -231162,63 +231418,57 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 @@ -231276,8 +231526,31 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -231535,23 +231808,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 @@ -231576,14 +231833,28 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -231600,7 +231871,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -231849,6 +232122,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -232214,9 +232488,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -232233,7 +232504,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232452,7 +232725,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -234057,21 +234330,15 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_mov_b32_e32 v57, v13 ; SI-NEXT: v_mov_b32_e32 v40, v3 ; SI-NEXT: v_mov_b32_e32 v54, v50 ; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; SI-NEXT: v_mov_b32_e32 v44, v15 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 @@ -234105,24 +234372,32 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v42, v43 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_mov_b32_e32 v5, v19 -; SI-NEXT: v_mov_b32_e32 v7, v15 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_mov_b32_e32 v5, v19 +; SI-NEXT: v_mov_b32_e32 v7, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -234258,7 +234533,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v50 +; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v53, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload @@ -234266,8 +234543,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v40, v3 ; SI-NEXT: v_mov_b32_e32 v44, v15 ; SI-NEXT: v_mov_b32_e32 v57, v13 @@ -234575,18 +234850,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16 @@ -234644,16 +234917,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -234679,8 +234955,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -234707,11 +234985,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -234755,25 +235034,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -234788,10 +235049,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -234806,8 +235067,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -234815,18 +235078,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -234834,18 +235096,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -234924,6 +235185,22 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -238138,23 +238415,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 @@ -238319,6 +238580,22 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -239903,39 +240180,38 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: 
v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -239946,6 +240222,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -240527,6 +240804,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -240543,16 +240830,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -241023,12 +241300,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 @@ -241040,24 +241315,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v54, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241067,13 +241325,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241082,22 +241335,39 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241115,6 +241385,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241158,6 +241430,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241294,27 +241571,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 @@ -241408,18 +241685,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], 
s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -241451,9 +241727,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -241468,10 +241742,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -241484,24 +241758,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 @@ -241530,25 +241805,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -241566,6 +241825,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -242603,24 +242878,9 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -242965,6 +243225,22 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -244047,8 +244323,15 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -244065,13 +244348,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index e688681c5ad09..9041f64cb17fb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -15670,25 +15670,8 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15698,6 +15681,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15727,6 +15711,22 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15947,16 +15947,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17964,6 +17964,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17980,14 +17988,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,15 +18005,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18044,7 +18046,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18099,14 +18101,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -21932,14 +21934,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 @@ -22058,6 +22052,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23916,6 +23918,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23929,18 +23943,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23972,16 +23974,20 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24021,7 +24027,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24099,17 +24105,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24225,6 +24232,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24241,14 +24256,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24266,15 +24273,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24305,7 +24314,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24360,14 +24369,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -28243,6 +28252,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28259,15 +28277,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28286,14 +28295,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28356,7 +28368,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28496,7 +28508,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28545,6 +28557,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28561,15 +28582,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28587,14 +28599,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28625,7 +28640,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28733,7 +28748,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28765,6 +28780,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28781,16 +28805,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28808,17 +28822,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: 
s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28849,7 +28863,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28957,7 +28971,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32287,6 +32301,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32303,15 +32326,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32330,14 +32344,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32400,7 +32417,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32540,7 +32557,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32589,6 +32606,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32605,15 +32631,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32631,14 +32648,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32669,7 +32689,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32777,7 +32797,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32809,6 +32829,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32825,16 +32854,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32852,17 +32871,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32893,7 +32912,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33001,7 +33020,7 @@ define <5 x 
i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 39da45b3e5063..ee23420c2a662 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8424,22 +8424,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -8475,6 +8459,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8757,22 +8757,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -8806,6 +8790,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9077,25 +9077,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9122,6 +9106,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11440,6 +11440,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11448,11 +11453,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,6 +11484,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11722,6 +11723,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11970,11 +11972,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12014,9 +12016,16 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12026,13 +12035,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12042,10 +12044,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12208,7 +12211,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12218,6 +12221,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12424,11 +12428,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12472,9 +12476,16 @@ define <16 x i32> 
@bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12484,13 +12495,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12500,10 +12504,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12666,7 +12671,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12676,6 +12681,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17317,13 +17323,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17346,9 +17352,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23309,22 +23315,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23360,6 +23350,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23642,22 +23648,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -23691,6 +23681,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23962,25 +23968,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24007,6 +23997,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -25434,21 +25440,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25469,7 +25460,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25499,6 +25490,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -25867,21 +25873,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -25898,7 +25889,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25924,6 +25915,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -26446,6 +26452,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26454,11 +26465,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26490,6 +26496,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26728,6 +26735,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26976,11 +26984,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27020,9 +27028,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; 
VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27032,13 +27047,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27048,10 +27056,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27214,7 +27223,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27224,6 +27233,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27430,11 +27440,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27478,9 +27488,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27490,13 +27507,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27506,10 +27516,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27672,7 +27683,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27682,6 +27693,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31676,13 +31688,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31705,9 +31717,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37702,22 +37714,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -37753,6 +37749,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38035,22 +38047,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, 
s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -38084,6 +38080,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38355,25 +38367,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38400,6 +38396,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -40728,6 +40740,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40736,11 +40753,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40772,6 +40784,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41010,6 +41023,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41258,11 +41272,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41302,9 +41316,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41314,13 +41335,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41330,10 +41344,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41496,7 +41511,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41506,6 +41521,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41712,11 +41728,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41760,9 +41776,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41772,13 +41795,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41788,10 +41804,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, 
v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41954,7 +41971,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41964,6 +41981,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -45299,13 +45317,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45328,9 +45346,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -51147,22 +51165,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -51198,6 +51200,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51472,22 +51490,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload 
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -51521,6 +51523,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -51784,25 +51802,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51829,6 +51831,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -54170,6 +54188,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54178,11 +54201,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54214,6 +54232,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54452,6 +54471,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54700,11 +54720,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54744,9 +54764,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, 
s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54756,13 +54783,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54772,10 +54792,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54938,7 +54959,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54948,6 +54969,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55154,11 +55176,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55202,9 +55224,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55214,13 +55243,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -55230,10 +55252,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, 
v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55396,7 +55419,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55406,6 +55429,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60556,8 +60580,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60574,6 +60596,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60637,8 +60661,9 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -64301,6 +64326,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -64317,18 +64354,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -64446,44 +64471,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v17, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -64780,6 +64805,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -64796,8 +64823,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -65069,25 +65094,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -65114,6 +65123,22 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -67218,21 +67243,6 
@@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -67249,7 +67259,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67275,6 +67285,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -67743,61 +67768,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 -; 
SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67812,24 +67793,25 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67837,6 +67819,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67850,8 +67833,57 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67860,6 +67892,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68140,6 +68173,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68164,6 +68198,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68187,6 +68222,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68394,6 +68430,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68410,8 +68448,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68422,6 +68458,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70159,12 +70196,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70181,7 +70219,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70205,19 +70242,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70243,7 +70280,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70688,6 +70725,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70704,12 +70747,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70721,13 +70758,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, v36 @@ -70735,6 +70770,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72152,8 +72188,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72170,6 +72204,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72237,8 +72273,9 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -76957,24 +76994,8 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -77002,6 +77023,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -77314,23 +77351,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77354,6 +77375,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -77626,25 +77663,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, 
v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77671,6 +77692,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -79126,12 +79163,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v52, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79177,7 +79215,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v42, s76 ; VI-NEXT: v_mov_b32_e32 v55, s74 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, s57 ; VI-NEXT: v_mov_b32_e32 v41, s59 ; VI-NEXT: v_mov_b32_e32 v44, s60 @@ -79283,21 +79320,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79318,7 +79340,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79350,6 +79372,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -79719,21 +79756,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -79750,7 +79772,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79776,6 +79798,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -80249,14 +80286,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 @@ -80331,10 +80360,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 @@ -80352,7 +80390,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v16, v19, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 @@ -80365,6 +80403,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; 
SI-NEXT: v_cvt_f32_f16_e32 v38, v8 @@ -80389,7 +80428,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 @@ -80596,12 +80634,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v7, v3, v7 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -80609,10 +80648,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v35, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 @@ -80811,6 +80852,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -80827,21 +80875,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 
v8, v33 ; SI-NEXT: v_mov_b32_e32 v10, v37 ; SI-NEXT: v_mov_b32_e32 v12, v49 ; SI-NEXT: v_mov_b32_e32 v14, v53 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v22, v38 ; SI-NEXT: v_mov_b32_e32 v24, v48 ; SI-NEXT: v_mov_b32_e32 v26, v50 @@ -84420,6 +84461,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 @@ -84485,22 +84542,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 @@ -84564,9 +84605,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -85177,24 +85220,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -85222,6 +85249,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -85777,22 +85820,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -85826,6 +85853,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -86357,10 +86400,21 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -86377,17 +86431,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, 
v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -88002,26 +88045,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_lshl_b32 s4, s76, 8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s74, 0xff ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -88049,6 +88076,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: @@ -88762,21 +88805,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s67, v63, 19 @@ -88799,7 +88827,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88829,6 +88857,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -90386,8 +90429,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90417,30 +90458,28 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90457,6 +90496,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90472,8 +90513,16 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90754,6 +90803,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90779,6 +90829,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90790,6 +90841,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90802,6 +90854,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90815,6 +90868,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ -91032,8 +91086,11 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91052,8 +91109,6 @@ define <32 x bfloat> 
@bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 46911e7934429..5d4df4bde1af8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4938,13 +4938,6 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -5044,6 +5037,13 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,14 +6164,6 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6188,28 +6180,36 @@ define 
<18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,12 +6224,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12069,13 +12071,6 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -12175,6 +12170,13 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13433,14 +13435,6 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13457,28 +13451,36 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13493,12 +13495,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -18416,13 +18420,6 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -18522,6 +18519,13 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19652,14 +19656,6 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], 
s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19676,28 +19672,36 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19712,12 +19716,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23972,13 +23978,6 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -24078,6 +24077,13 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25276,14 +25282,6 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25300,28 +25298,36 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, 
v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25336,12 +25342,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26790,6 +26798,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26814,22 +26838,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; 
SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26857,7 +26865,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26884,7 +26892,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26969,6 +26977,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -27138,24 +27147,8 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -27210,6 +27203,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27751,17 +27760,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -27844,6 +27842,17 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -28700,12 +28709,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 @@ -28807,6 +28810,12 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29412,15 +29421,9 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29490,6 +29493,12 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 6749daba296c5..44cfd6c28ca6a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,17 +3541,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3573,6 +3562,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,10 +3594,13 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -5554,23 +5557,10 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -5666,6 +5656,19 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11737,17 +11740,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11769,6 +11761,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11790,10 +11793,13 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13750,23 +13756,10 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -13862,6 +13855,19 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14429,10 +14435,6 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -14571,6 +14573,9 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -19244,17 
+19249,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19276,6 +19270,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19297,10 +19302,13 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -21257,23 +21265,10 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -21369,6 +21364,19 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25980,17 +25988,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26012,6 +26009,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26033,10 +26041,13 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27973,23 +27984,10 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -28085,6 +28083,19 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28624,11 +28635,6 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -28767,6 +28773,10 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -31379,17 +31389,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31406,6 +31405,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], 
s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31462,7 +31472,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31513,6 +31523,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31612,6 +31623,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31631,6 +31643,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -31837,23 +31850,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -31891,6 +31888,22 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32586,6 +32599,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32602,11 +32620,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -33561,20 +33574,8 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; 
SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -33689,6 +33690,18 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34404,18 +34417,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -34450,6 +34451,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 6b13e96d73999..87d5157b3c340 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,17 +3792,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3825,6 +3814,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, 
v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,8 +3842,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -6117,24 +6118,8 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6156,6 +6141,22 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12754,17 +12755,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12787,6 +12777,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12804,8 +12805,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -15079,24 +15081,8 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15118,6 +15104,22 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15742,15 +15744,6 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -15903,6 +15896,14 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -21003,17 +21004,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21036,6 +21026,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -21053,8 +21054,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -23328,24 +23330,8 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -23367,6 +23353,22 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28418,17 +28420,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> 
%a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28451,6 +28442,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28468,8 +28470,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -30721,24 +30724,8 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -30760,6 +30747,22 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31352,16 +31355,6 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -31514,6 +31507,15 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -34942,23 +34944,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34996,6 +34982,22 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35797,6 +35799,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -35813,11 +35820,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -36877,19 +36879,9 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen @@ -37002,6 +36994,16 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37791,22 +37793,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -37841,6 +37827,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 034b8027851f4..fb2e94fc3b87a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,22 +4045,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4085,6 +4069,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 
@@ -4100,14 +4100,21 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -6615,24 +6622,8 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6708,6 +6699,22 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8172,8 +8179,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8190,6 +8195,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8216,34 +8223,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -13036,9 +13043,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -13150,6 +13154,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -13875,22 +13882,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13915,6 +13906,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; 
SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13930,14 +13937,21 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -16445,24 +16459,8 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -16538,6 +16536,22 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17219,27 +17233,13 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -17395,6 +17395,19 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -18144,8 +18157,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18162,6 +18173,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18188,34 +18201,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -22969,22 +22982,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23009,6 +23006,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23024,14 +23037,21 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -25539,24 +25559,8 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -25632,6 +25636,22 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ 
-27108,8 +27128,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27126,6 +27144,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27152,34 +27172,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -30364,9 +30384,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, 
s[0:3], 0 offen @@ -30478,6 +30495,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -31179,22 +31199,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31219,6 +31223,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31234,14 +31254,21 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: 
v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -33725,24 +33752,8 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -33818,6 +33829,22 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34464,28 +34491,13 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -34641,6 +34653,20 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -35366,8 +35392,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35384,6 +35408,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35410,34 +35436,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -38310,24 +38336,8 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38514,6 +38524,22 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39425,8 +39451,14 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -39443,12 +39475,6 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -40728,23 +40754,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 @@ -40758,6 +40767,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 @@ -40765,6 +40775,22 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41229,11 +41255,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41250,6 +41271,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41294,12 +41320,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: 
v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -41664,6 +41694,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -41680,9 +41713,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 8b6210d6a817a..07cdbef82d892 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2741,14 +2741,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -2860,6 +2855,11 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4341,19 +4341,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, 
v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4379,6 +4366,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,12 +4394,17 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4424,10 +4429,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4439,9 +4443,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5027,6 +5032,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5093,7 +5099,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5226,9 +5231,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5243,6 +5245,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5261,10 +5266,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt 
vmcnt(14) @@ -5293,6 +5294,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7258,6 +7263,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7274,11 +7284,6 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8046,34 +8051,29 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: 
v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -8225,6 +8225,11 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -9755,6 +9760,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9821,7 +9827,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9954,9 +9959,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9971,6 +9973,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9990,10 +9995,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10022,6 +10023,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10290,28 +10295,14 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill 
@@ -10327,6 +10318,22 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10335,8 +10342,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10356,8 +10363,10 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10398,11 +10407,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; 
SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10416,6 +10425,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10453,6 +10463,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -13345,14 +13356,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13464,6 +13470,11 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14198,14 +14209,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 
; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14269,6 +14272,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -15065,19 +15076,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15103,6 +15101,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15118,12 +15129,17 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; 
SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15148,10 +15164,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15163,9 +15178,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15751,6 +15767,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15817,7 +15834,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15950,9 +15966,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15967,6 +15980,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -15985,10 +16001,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16017,6 +16029,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17982,6 +17998,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17998,11 +18019,6 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18922,9 
+18938,14 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18941,11 +18962,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -20637,6 +20653,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20703,7 +20720,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20836,9 +20852,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20853,6 +20866,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded 
Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20872,10 +20888,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20904,6 +20916,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21172,28 +21188,14 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21209,6 +21211,22 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21217,8 +21235,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21238,8 +21256,10 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21280,11 +21300,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21298,6 +21318,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21335,6 +21356,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -23387,14 +23409,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -23506,6 +23523,11 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25001,19 +25023,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25039,6 +25048,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25054,12 +25076,17 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25084,10 +25111,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25099,9 +25125,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 
v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25687,6 +25714,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25753,7 +25781,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25886,9 +25913,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25903,6 +25927,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25921,10 +25948,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25953,6 +25976,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) 
{ ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -27919,6 +27946,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27935,11 +27967,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28721,34 +28748,29 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: 
v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -28900,6 +28922,11 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -30430,6 +30457,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30496,7 +30524,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30629,9 +30656,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30646,6 +30670,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: 
v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30665,10 +30692,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30697,6 +30720,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30965,28 +30992,14 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31002,6 +31015,22 @@ define inreg <13 x i64> 
@bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31010,8 +31039,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31031,8 +31060,10 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31073,11 +31104,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31091,6 
+31122,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31128,6 +31160,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -32365,14 +32398,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -32484,6 +32512,11 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33179,14 +33212,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33250,6 +33275,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -34020,19 +34053,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -34058,6 +34078,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -34073,12 +34106,17 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; 
SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -34103,10 +34141,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -34118,9 +34155,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -34706,6 +34744,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -34772,7 +34811,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -34905,9 +34943,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 
s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -34922,6 +34957,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -34940,10 +34978,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -34972,6 +35006,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -36911,6 +36949,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -36927,11 +36970,6 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37812,9 +37850,14 @@ define inreg <52 x 
half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -37831,11 +37874,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -39501,6 +39539,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -39567,7 +39606,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -39700,9 +39738,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -39717,6 +39752,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -39736,10 +39774,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -39768,6 +39802,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -40036,28 +40074,14 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 
; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -40073,6 +40097,22 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -40081,8 +40121,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -40102,8 +40142,10 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -40144,11 +40186,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -40162,6 +40204,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -40199,6 +40242,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -41826,24 +41870,8 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -42074,6 +42102,22 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -43294,7 +43338,23 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -43311,22 +43371,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44640,25 +44684,9 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -44686,6 +44714,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -45204,15 +45248,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -45229,6 +45264,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 @@ -45273,19 +45317,26 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 @@ -45678,25 +45729,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45724,6 +45759,22 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 09cf27810a5c9..8eb71e90f8504 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll 
@@ -2928,18 +2928,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3039,6 +3030,15 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4665,11 +4665,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -4699,6 +4694,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, 
v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -4715,8 +4715,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -5412,6 +5413,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -5484,7 +5486,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5633,9 +5634,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -5650,6 +5648,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -5668,10 +5669,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) 
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5700,6 +5697,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7829,7 +7830,21 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7846,20 +7861,6 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8742,15 +8743,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -8896,6 +8888,15 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10559,6 +10560,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -10631,7 +10633,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10780,9 +10781,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Reload @@ -10797,6 +10795,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -10816,10 +10817,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10848,6 +10845,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11147,20 +11148,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11168,6 +11156,7 @@ define inreg <28 x i32> 
@bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -11199,6 +11188,19 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -11215,11 +11217,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -11297,7 +11299,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -11316,6 +11317,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -11583,6 +11585,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) 
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -14431,18 +14434,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14542,6 +14536,15 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15379,21 +15382,9 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15421,6 +15412,18 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -16287,11 +16290,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -16321,6 +16319,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -16337,8 +16340,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -17034,6 +17038,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -17106,7 +17111,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17255,9 +17259,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -17272,6 +17273,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -17290,10 +17294,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -17322,6 +17322,10 @@ define <28 x float> 
@bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -19451,7 +19455,21 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19468,20 +19486,6 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20469,12 +20473,28 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: 
buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20491,22 +20511,6 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -22339,6 +22343,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22411,7 +22416,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22560,9 +22564,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 
4-byte Folded Reload @@ -22577,6 +22578,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22596,10 +22600,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,6 +22628,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22927,20 +22931,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22948,6 +22939,7 @@ define inreg <28 x float> 
@bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22979,6 +22971,19 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -22995,11 +23000,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23077,7 +23082,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23096,6 +23100,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23363,6 +23368,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; 
SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -25323,18 +25329,9 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25434,6 +25431,15 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27074,11 +27080,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27108,6 +27109,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27124,8 +27130,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27821,6 +27828,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27893,7 +27901,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28042,9 +28049,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28059,6 +28063,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28077,10 
+28084,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28109,6 +28112,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -30238,7 +30245,21 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30255,20 +30276,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31165,15 +31172,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_load_dword v56, 
off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -31319,6 +31317,15 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -32982,6 +32989,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33054,7 +33062,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33203,9 +33210,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33220,6 +33224,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33239,10 +33246,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33271,6 +33274,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33570,20 +33577,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33591,6 +33585,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33622,6 +33617,19 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33638,11 +33646,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -33720,7 +33728,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33739,6 +33746,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34006,6 +34014,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -35103,18 +35112,9 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -35214,6 +35214,15 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36009,21 +36018,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36051,6 +36048,18 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -36889,11 +36898,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36923,6 +36927,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ 
-36939,8 +36948,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37636,6 +37646,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37708,7 +37719,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37857,9 +37867,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37874,6 +37881,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37892,10 +37902,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37924,6 +37930,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -40031,7 +40041,14 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -40048,13 +40065,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41025,6 +41035,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -41041,11 +41056,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -42850,6 +42860,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42922,7 +42933,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43071,9 +43081,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43088,6 +43095,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43107,10 +43117,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43139,6 +43145,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; 
GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43438,20 +43448,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43459,6 +43456,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43490,6 +43488,19 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43506,11 +43517,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43588,7 +43599,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43607,6 +43617,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43874,6 +43885,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -45383,24 +45395,8 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45666,6 +45662,22 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -47029,22 +47041,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -47065,6 +47061,22 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -48501,22 +48513,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 @@ -48539,6 +48535,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -49679,22 +49691,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -49716,6 +49712,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4175d5f2de73d..93c11f13ce3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3108,22 +3108,9 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3199,6 +3186,19 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5032,53 +5032,40 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5109,10 +5096,27 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5197,6 +5201,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5341,6 +5346,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; 
SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5488,7 +5494,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5770,6 +5776,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5848,7 +5855,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6013,9 +6019,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -6030,6 +6033,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6048,10 +6054,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 
op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6080,6 +6082,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -8381,6 +8387,11 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -8397,11 +8408,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9387,24 +9393,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -9526,6 +9519,19 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10339,9 +10345,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10370,12 +10373,23 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10385,6 +10399,8 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10406,18 +10422,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10427,7 +10434,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10465,6 +10471,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10479,7 +10486,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11351,6 +11357,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11429,7 +11436,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11594,9 +11600,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11611,6 +11614,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11630,10 +11636,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -11662,6 +11664,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11982,35 +11988,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; 
SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12020,7 +12003,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12029,7 +12012,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12038,7 +12021,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12049,12 +12032,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12079,12 +12088,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12193,10 +12202,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -15520,22 +15531,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15611,6 +15609,19 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16541,7 +16552,12 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16557,11 +16573,6 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -17559,53 +17570,40 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17636,10 +17634,27 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17724,6 +17739,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; 
SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17868,6 +17884,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18015,7 +18032,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18297,6 +18314,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18375,7 +18393,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18540,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18557,6 +18571,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18575,10 +18592,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18607,6 +18620,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -20908,6 +20925,11 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -20924,11 +20946,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21844,7 +21861,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen @@ -22011,7 +22028,24 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: 
v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -22028,23 +22062,6 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -23027,9 +23044,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23058,12 +23072,23 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23073,6 +23098,8 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23094,18 +23121,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23115,7 +23133,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23153,6 +23170,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23167,7 +23185,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24039,6 +24056,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24117,7 +24135,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24282,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24299,6 +24313,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24318,10 +24335,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24350,6 +24363,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24670,6 +24687,50 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x 
half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -24694,55 +24755,14 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24767,12 +24787,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -24881,10 +24901,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -27278,22 +27300,9 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -27369,6 +27378,19 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29218,53 +29240,40 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29295,10 +29304,27 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29383,6 +29409,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29527,6 +29554,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29674,7 +29702,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29956,6 +29984,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -30034,7 +30063,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30199,9 +30227,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30216,6 +30241,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30234,10 +30262,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30266,6 +30290,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -32568,6 +32596,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -32584,11 +32617,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 
v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33590,24 +33618,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -33729,6 +33744,19 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -34542,9 +34570,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34573,12 +34598,23 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34588,6 +34624,8 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34609,18 +34647,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34630,7 +34659,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34668,6 +34696,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34682,7 +34711,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35554,6 +35582,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35632,7 +35661,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35797,9 +35825,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35814,6 +35839,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35833,10 +35861,6 @@ define 
<15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35865,6 +35889,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36185,35 +36213,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; 
SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36223,7 +36228,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36232,7 +36237,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36241,7 +36246,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36252,12 +36257,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -36282,12 +36313,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36396,10 +36427,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37889,22 +37922,9 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37980,6 +38000,19 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38866,7 +38899,12 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -38883,11 +38921,6 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -39855,53 +39888,40 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39932,10 +39952,27 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt 
vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40020,6 +40057,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40164,6 +40202,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40311,7 +40350,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40593,6 +40632,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40671,7 +40711,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40836,9 +40875,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40853,6 +40889,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40871,10 +40910,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40903,6 +40938,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -43134,24 +43173,8 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -43195,6 +43218,22 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44250,6 +44289,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -44266,11 +44310,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x 
double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -45223,9 +45262,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45254,12 +45290,23 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45269,6 +45316,8 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45290,18 +45339,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45311,7 +45351,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45349,6 +45388,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45363,7 +45403,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46235,6 +46274,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46313,7 +46353,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46478,9 +46517,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; 
GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46495,6 +46531,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46514,10 +46553,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46546,6 +46581,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46866,6 +46905,50 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -46890,55 +46973,14 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -46963,12 +47005,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47077,10 +47119,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -49257,23 +49301,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -49311,6 +49339,22 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -50812,22 +50856,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 @@ -50848,6 +50876,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51849,27 +51893,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -52404,7 +52448,18 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -52421,17 +52476,6 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -53215,8 +53259,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53243,13 +53285,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53261,26 +53300,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53308,11 +53329,17 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53355,32 +53382,52 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_mov_b32_e32 v59, v48 ; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 ; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53477,15 +53524,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53495,6 +53541,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 @@ -53615,15 +53662,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -53634,11 +53681,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -53648,9 +53693,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -53675,24 +53722,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -53717,6 +53748,22 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index b6b59d809306a..30ad46d959b7e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,7 +9552,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9564,6 +9563,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -9686,17 +9686,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 @@ -9827,6 +9816,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 04f8ad8a02303..68313807c427f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,38 +450,23 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -991,38 +976,23 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt 
vmcnt(15) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1189,23 +1159,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1249,12 +1220,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 832e43f1e1973..8e12e7e03947b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4253,7 +4253,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4261,6 +4260,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; 
VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,7 +4285,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4293,6 +4292,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm @@ -4317,7 +4317,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4325,6 +4324,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s3 @@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 5cc68451d5ab7..0cae0e51107df 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void 
@unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index a4b3a8544dede..683887b0a55f3 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -476,6 +476,7 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse @@ -488,7 +489,6 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1029,6 +1029,7 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse @@ -1039,7 +1040,6 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse -; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index da08f4fcf8f3d..5fb50d0d89530 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,44 +3755,42 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_or_b32_e32 v17, v18, v17 +; CI-NEXT: 
v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3804,27 +3802,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3991,6 +3968,28 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; 
CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 590d69b8eb869..279f4298e6418 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,19 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s ---- | - - @foo = addrspace(3) global i32 poison - - define void @test_overlap() { unreachable } - define void @test_dead_redef() { unreachable } - define void @test_tied() { unreachable } - define void @test_mmo_merge1() { unreachable } - define void @test_mmo_merge2() { unreachable } - define void @test_mmo_drop() { unreachable } - -... - --- name: test_overlap body: | @@ -60,42 +47,3 @@ body: | %1:vgpr_32 = COPY %0:vgpr_32 %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec ... - ---- -name: test_mmo_merge1 -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_merge1 - ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: } - %1:vgpr_32 = COPY %0:vgpr_32 - DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) -... - ---- -name: test_mmo_merge2 -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_merge2 - ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) - ; CHECK-NEXT: } - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) -... - ---- -name: test_mmo_drop -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_drop - ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec - ; CHECK-NEXT: } - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec -... 
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index d43c6ba322619..b750d28ffa7d3 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX10-NEXT: s_clause 0x1f ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2416,6 +2416,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 @@ -2458,7 +2459,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x8 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -2467,7 +2468,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill +; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload +; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,6 +2641,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2718,21 +2733,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,7 +2914,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2957,11 +2971,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -2991,24 +3006,9 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0x3e +; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -3050,29 +3050,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 -; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:560 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:564 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:568 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:572 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:576 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3095,6 +3073,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: s_clause 0x15 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:560 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:564 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:568 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:572 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:576 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 ; GFX10-NEXT: v_mov_b32_e32 v0, 24 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 @@ -3137,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3150,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe 
; 60-byte Folded Reload +; GFX10-NEXT: s_clause 0xe ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3198,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill +; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3340,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload +; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 93d7eeb085107..f80716939f618 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 03b56cad85dac..7e1055b2a28a4 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 +11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { + ; CHECK-NEXT: BUNDLE 
implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 68f9e839012c3..9689dda9932ed 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: 
mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index cbf697fafe683..4719ab9090fa5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -1,20 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s +; MIR-LABEL: name: gws_barrier_offset0{{$}} +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") +; MIR-NEXT: S_WAITCNT 0 +; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { - ; MIR-LABEL: name: gws_barrier_offset0 - ; MIR: bb.0 (%ir-block.0): - ; MIR-NEXT: liveins: $sgpr8_sgpr9 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4) - ; MIR-NEXT: $m0 = S_MOV_B32 0 - ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec - ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") { - ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") - ; MIR-NEXT: S_WAITCNT 0 - ; MIR-NEXT: } - ; MIR-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } @@ -24,3 +17,5 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index 417b8e08cf669..c5f6e2b0098ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -35,7 +35,7 @@ ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] ; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index af270e5adf75c..4419b8c6f9862 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 72b47693c69f8..0194d25a99cdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; 
SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] @@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l ; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 ; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 1d08097452ce6..0c1448a0b8fb6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,19 +17,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b32_e32 v5, s16 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] @@ -41,12 +43,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: 
v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] @@ -172,15 +175,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11] @@ -203,15 +207,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] @@ -515,19 +520,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v5, s16 -; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] @@ -627,15 +634,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GCN-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11] @@ -794,11 +802,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -807,6 +815,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -824,11 +833,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -955,14 +965,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -992,14 +1003,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -1305,11 +1317,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1318,6 +1330,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1335,11 +1348,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1467,11 +1481,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1480,6 +1494,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: 
v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1497,11 +1512,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1629,11 +1645,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1642,6 +1658,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1659,11 +1676,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1791,11 +1809,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1804,6 +1822,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1821,11 +1840,12 
@@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1952,14 +1972,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -1989,14 +2010,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2301,14 +2323,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; 
SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -2338,14 +2361,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2650,14 +2674,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -2687,14 +2712,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2999,14 +3025,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: 
global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -3036,14 +3063,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 83c240c17ff1c..f93e5f06beff9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10386,8 +10386,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 @@ -10396,6 +10395,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 @@ -10403,21 +10406,20 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: 
flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 ; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] @@ -10586,8 +10588,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 59f4a9d44bbdd..bca39d06e941c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 7203545ebf9a8..e55fb2cac0985 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 @@ -3726,6 +3726,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_nop 0 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload @@ -3739,7 +3740,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26 ; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 @@ -3748,7 +3749,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 @@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index cb17f01853221..f879dc660203f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -7788,18 +7788,19 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128 @@ -7809,7 +7810,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: 
s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 062a985dd7180..bd191a37582c0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -3172,25 +3172,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3198,6 +3200,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 @@ -3240,19 +3243,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 
0xffff, v18 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 @@ -3295,17 +3296,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3332,11 +3337,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 @@ -3357,17 +3360,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 @@ -3804,11 +3806,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3825,16 +3825,23 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3843,25 +3850,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: 
buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3872,17 +3875,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3941,11 +3943,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3964,16 +3964,24 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; 
GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3982,26 +3990,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4012,17 +4015,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4195,20 +4197,29 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: 
s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4218,7 +4229,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4236,24 +4247,16 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4313,14 +4316,23 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr 
addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4330,7 +4342,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4348,24 +4360,16 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ 
-4853,12 +4857,10 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4871,6 +4873,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -4891,11 +4899,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4908,15 +4913,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4981,11 +4985,9 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], 
v32 offset:48 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4999,6 +5001,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -5019,12 +5028,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -5037,15 +5042,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 9da7a79ba2fdf..1d1d3e4a68fee 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -15,23 +15,24 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -62,10 +63,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 @@ -99,24 +100,25 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -139,23 +141,24 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 
0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 84db54c2d537f..fc36ed939d91d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,23 +181,24 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -214,12 +215,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -238,23 +239,24 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -271,12 +273,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -411,11 +413,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -466,24 +468,25 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, 
s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -500,13 +503,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -525,24 +528,25 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -559,13 +563,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index c1f4d7bbf650e..4ab05c2923fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -19,12 +19,12 @@ $_f2 = comdat any define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: ds_read_u8 v2, v1 offset:2 ; GCN-NEXT: ds_read_u16 v3, v1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 2 -; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index ccfd45bc87e71..24c1bfb8d50f0 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -75,15 +75,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 -; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ba532949a687d..ae0805448d693 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index af7ca0fb59682..02f39e25cb447 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,10 +1854,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1866,6 +1862,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,6 +1901,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1915,14 +1923,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 
offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,96 +3797,57 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x33 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; 
ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3895,83 +3856,76 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, 
off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3980,82 +3934,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], 
s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4327,132 +4251,259 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 
offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 
offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen 
offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 
4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, 
s[0:3], 0 offen -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4462,34 +4513,37 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 -; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 @@ -4499,7 +4553,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4572,7 +4625,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 @@ -4589,6 +4641,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 @@ -4605,6 +4672,18 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 @@ -4621,36 +4700,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4663,6 +4712,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -5128,8 +5181,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -5183,6 +5234,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -5221,7 +5274,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -6744,7 +6797,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8243,7 +8296,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8795,6 +8848,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 @@ -8810,14 +8871,6 @@ define void 
@memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 @@ -9244,10 +9297,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -9256,6 +9305,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -9291,6 +9344,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -9305,14 +9366,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -12145,7 +12198,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -12592,11 +12645,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-LABEL: memmove_p0_p5_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -12645,29 +12693,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: 
buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -12689,17 +12742,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 @@ -12707,94 +12760,58 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x30 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, 
s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; 
ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12802,81 +12819,82 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte 
v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill @@ -12884,97 +12902,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v49, 
off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0xc -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, 
s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill @@ -13246,158 +13214,289 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: 
buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; 
ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 +; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v48, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 ; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: 
v_lshl_or_b32 v95, v5, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill @@ -13410,6 +13509,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 @@ -13418,11 +13518,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -13491,8 +13590,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 ; 
ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 @@ -13509,6 +13606,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 @@ -13525,6 +13638,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 @@ -13541,36 +13666,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, 
s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -13583,6 +13678,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -14048,8 +14147,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -14103,6 +14200,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -14154,23 +14253,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 @@ -14192,17 +14291,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 @@ -14210,97 +14309,57 @@ define void 
@memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, 
v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, 
v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14309,88 +14368,75 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: 
buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte 
v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14399,83 +14445,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 
offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14748,126 +14763,259 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: 
buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; 
ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: 
buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: 
buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill 
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 
v3, v76, 16, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14879,34 +15027,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 @@ -14915,11 +15065,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14988,8 +15137,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15006,6 +15153,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15022,6 +15185,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 +; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15038,36 +15213,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 @@ -15080,6 +15225,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 
offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 @@ -15545,10 +15694,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -15602,6 +15747,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 @@ -15639,7 +15788,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 32800488f0633..71900a4d1c1e4 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null -; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 ; GFX12-GISEL-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 -; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 1177474f5b4f5..78207c2cf605e 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,47 +185,44 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -249,8 +246,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 
0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -260,7 +255,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -296,8 +293,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -307,7 +302,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -343,14 +340,11 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -360,18 +354,22 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -411,9 +409,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -426,6 +421,10 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1443,47 +1442,44 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 
v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1507,8 +1503,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1518,7 +1512,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1554,8 +1550,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1565,7 +1559,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1601,14 +1597,11 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1618,18 +1611,22 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1669,9 +1666,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1684,6 +1678,10 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2275,47 +2273,44 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 +; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2339,8 +2334,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2350,7 +2343,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2386,8 +2381,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2397,7 +2390,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_fma_f32 
v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2435,9 +2430,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2450,6 +2442,10 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2500,9 +2496,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2515,6 +2508,10 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index 025d9e63436d7..d0d5cc11994af 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,7 +359,6 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABLE: name: no_sched_barrier_within_bundle ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index e0266b9f1a5b0..5fea0aee72ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ -25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1,
implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b91bdd2b2fa71..85a9aba1a0e51 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,8 +514,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -524,15 +526,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 
v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,8 +734,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -751,42 +753,39 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(10) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(9) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index fb9c47731eb42..7a3bff8aed56e 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,29 +28,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -58,8 +44,27 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -69,40 +74,49 @@ define amdgpu_ps float 
@ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -111,19 +125,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, 
vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -138,7 +139,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -153,70 +153,84 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; 
VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: 
buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -224,19 +238,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -251,7 +252,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -266,69 +266,83 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, 
s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -336,18 +350,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -362,8 +364,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -392,6 +392,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -413,6 +414,7 @@ define amdgpu_ps float @ps_main(i32 %idx) 
{
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -420,23 +422,24 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -444,8 +447,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -453,9 +457,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -474,8 +475,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -504,6 +503,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -525,6 +525,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -532,23 +533,24 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -556,8 +558,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -565,9 +568,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -586,8 +586,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -976,43 +974,42 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -1027,7 +1024,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -1053,29 +1051,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -1083,8 +1067,27 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1094,40 +1097,49 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1136,19 +1148,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1163,7 +1162,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1178,70 +1176,84 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1249,19 +1261,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1276,7 +1275,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1291,69 +1289,83 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1361,18 +1373,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -1387,8 +1387,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1417,6 +1415,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1438,6 +1437,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1445,23 +1445,24 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1469,8 +1470,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1478,9 +1480,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1499,8 +1498,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1529,6 +1526,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1550,6 +1548,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1557,23 +1556,24 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1581,8 +1581,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1590,9 +1591,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1611,8 +1609,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2001,43 +1997,42 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2052,7 +2047,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2078,29 +2074,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2108,8 +2090,27 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2119,61 +2120,57 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:772
-; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2188,7 +2185,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2203,70 +2199,84 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2274,19 +2284,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2301,7 +2298,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2316,69 +2312,83 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2386,18 +2396,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -2412,8 +2410,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ;
GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2442,6 +2438,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2463,6 +2460,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2470,23 +2468,24 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2494,8 +2493,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword 
v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2503,9 +2503,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2524,8 +2521,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2554,6 +2549,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2575,6 +2571,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2582,23 +2579,24 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, 
s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2606,8 +2604,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2615,9 +2614,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2636,8 +2632,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3026,43 +3020,42 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 
0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -3077,7 +3070,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 
; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -3103,29 +3097,15 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -3133,8 +3113,27 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -3144,40 +3143,49 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; 
SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3186,19 +3194,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, 
s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3213,7 +3208,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3228,70 +3222,84 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 
0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3299,19 +3307,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, 
s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3326,7 +3321,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3340,69 +3334,83 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 
offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -3410,18 +3418,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -3436,8 +3432,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3465,6 +3459,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3486,6 +3481,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3493,23 +3489,24 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3517,8 +3514,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3526,9 +3524,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3547,8 +3542,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, 
s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3576,6 +3569,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3597,6 +3591,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3604,23 +3599,24 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3628,8 +3624,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 
0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3637,9 +3634,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3658,8 +3652,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4048,43 +4040,42 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; 
GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4099,7 +4090,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4125,29 +4117,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: 
buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -4155,8 +4133,27 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4166,40 +4163,49 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4208,19 +4214,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4235,7 +4228,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4250,70 +4242,84 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 
0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4321,19 +4327,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4348,7 +4341,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4362,69 +4354,83 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 
v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: 
buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4432,18 +4438,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, 
s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -4458,8 +4452,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4487,6 +4479,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4508,6 +4501,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4515,23 +4509,24 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4539,8 +4534,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4548,9 +4544,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4569,8 +4562,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4598,6 +4589,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4619,6 +4611,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 
offset:240 @@ -4626,23 +4619,24 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4650,8 +4644,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4659,9 +4654,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4680,8 +4672,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -5070,43 +5060,42 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5121,7 +5110,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5151,29 +5141,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -5181,8 +5157,27 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5192,40 +5187,49 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; 
SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5234,19 +5238,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5261,7 +5252,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5277,70 +5267,84 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, 
s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 -; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 ; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; 
VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 +; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5348,19 +5352,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5375,7 +5366,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5390,69 +5380,83 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 
0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5460,18 +5464,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -5486,9 +5478,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -5500,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5514,6 +5505,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5535,6 +5528,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5542,25 +5536,24 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5568,8 +5561,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5577,8 +5571,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5597,8 +5589,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -5612,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5626,6 +5616,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5647,6 +5639,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5654,25 +5647,24 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5680,8 +5672,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5689,8 +5682,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5709,8 +5700,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6104,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -6116,31 +6105,29 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -6155,7 +6142,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -6184,29 +6172,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -6214,8 +6188,27 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6225,40 +6218,49 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6267,19 +6269,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6294,7 +6283,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; SI-NEXT: s_mov_b32 s2, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -6310,70 +6298,84 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6381,19 +6383,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6408,7 +6397,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; VI-NEXT: s_mov_b32 s2, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6423,69 +6411,83 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6493,18 +6495,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6519,9 +6509,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -6533,10 +6522,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6547,6 +6536,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6568,6 +6559,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6575,25 +6567,24 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6601,8 +6592,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6610,8 +6602,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6630,8 +6620,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6645,10 +6633,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6659,6 +6647,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6680,6 +6670,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6687,25 +6678,24 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6713,8 +6703,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6722,8 +6713,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6742,8 +6731,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -7137,10 +7124,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -7149,31 +7136,29 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -7188,7 +7173,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index c90d7887f2ff6..71e4755b58bf2 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -3,6 +3,9 @@
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
; CHECK-LABEL: excess_soft_clause_reg_pressure:
; CHECK: BB0_1: ; %for.cond28.preheader
+; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
; CHECK: global_load_dword
; CHECK-NEXT: global_load_dword
; CHECK-NEXT: global_load_dword
@@ -15,23 +18,11 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
-; CHECK-NOT: v_writelane_b32
-; CHECK-NOT: v_readlane_b32
-
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
-; CHECK-NOT: v_writelane_b32
-; CHECK-NOT: v_readlane_b32
-
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
-
entry:
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%i2 = load i64, ptr addrspace(4) %i, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 1a0f75e048cb9..da48af100d27b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index b5474b8974b29..50056b62b3397 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10314,8 +10314,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
@@ -10328,10 +10327,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
@@ -10343,9 +10344,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
@@ -10359,7 +10358,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
@@ -10466,13 +10468,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 802de8037cf6b..9cb22dad86b88 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_writelane_b32 v40, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index a1771f9356014..ee2e58f2a6cc1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -98,29 +98,28 @@ body: |
; CHECK-LABEL: name: foo
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
- ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
- ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
- ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
- ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
- ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
- ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
- ; CHECK-NEXT: }
- ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
- ; CHECK-NEXT: MVE_VPST 4, implicit $vpr
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable
$r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK-NEXT: } - ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { + ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK: } + ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK: } + ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 From fb49adb6ea8f5b476b42a118aac758006f0a4152 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Wed, 5 Nov 2025 10:17:52 +0000 Subject: [PATCH 296/313] [AMDGPU] Another test for missing S_WAIT_XCNT (#166154) --- llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index a1381ecad81e2..f964480dcc633 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -1069,6 +1069,51 @@ body: | $sgpr0 = S_MOV_B32 $sgpr0 ... +# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. 
+--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... + --- name: pending_vmem_event_between_block tracksRegLiveness: true From 28e024fb9d42fd44e0ad70638da0b6a12ba544dc Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Wed, 5 Nov 2025 10:33:30 +0000 Subject: [PATCH 297/313] [X86] narrowBitOpRMW - allow additional uses of the BTC/R/S result (#166376) If there are additional uses of the bit twiddled value as well as the rmw store, we can replace them with a (re)loaded copy of the full width integer value after the store. There's some memory op chain handling to handle here - the additional (re)load is chained after the new store and then any dependencies of the original store are chained after the (re)load. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 401 ++++--------------- 2 files changed, 87 insertions(+), 331 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 06b8f7614bffd..4d44227b3ecd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53370,8 +53370,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) SDValue SrcVal, InsertBit, ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Value(SrcVal), + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || @@ -53442,8 +53441,18 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); } - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. 
+ if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index bcb14fd25b975..32d225273a6e1 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -906,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; 
AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1088,247 +1019,63 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind ; X86-LABEL: chain_reset_i256: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $112, %esp -; X86-NEXT: movzbl 20(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $28, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 72(%esp,%eax), %edx -; X86-NEXT: movl 76(%esp,%eax), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %edx -; X86-NEXT: movl 84(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl 64(%esp,%eax), %edi -; X86-NEXT: movl 88(%esp,%eax), %esi -; X86-NEXT: movl 92(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: 
shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: notl %eax -; X86-NEXT: notl %edx -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: notl %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 8(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 20(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl 16(%ecx), %edx -; X86-NEXT: andl 28(%ecx), %eax -; X86-NEXT: andl 24(%ecx), %ebx -; X86-NEXT: andl 4(%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl (%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, 24(%ecx) -; X86-NEXT: movl %eax, 28(%ecx) -; X86-NEXT: movl %edx, 16(%ecx) -; X86-NEXT: movl %edi, 20(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) ; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: jne .LBB23_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: .LBB23_2: -; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: chain_reset_i256: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups 
%xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: shrb $3, %al -; SSE-NEXT: andb $24, %al -; SSE-NEXT: negb %al -; SSE-NEXT: movsbq %al, %r10 -; SSE-NEXT: movq -24(%rsp,%r10), %r8 -; SSE-NEXT: movq -16(%rsp,%r10), %rax -; SSE-NEXT: shldq %cl, %r8, %rax -; SSE-NEXT: movq -32(%rsp,%r10), %r9 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -40(%rsp,%r10), %r10 -; SSE-NEXT: shldq %cl, %r10, %r9 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rax -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: andq 16(%rdi), %r8 -; SSE-NEXT: andq 8(%rdi), %r9 -; SSE-NEXT: andq (%rdi), %r10 -; SSE-NEXT: movq %r8, 16(%rdi) -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %r10, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: orq %rax, %r9 -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movl (%rsi), %eax -; SSE-NEXT: movl %r10d, (%rsi) -; SSE-NEXT: movl (%rdx), %ecx -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: retq -; -; AVX2-LABEL: chain_reset_i256: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrb $3, %al -; AVX2-NEXT: andb $24, %al -; AVX2-NEXT: negb %al -; AVX2-NEXT: movsbq %al, %rax -; AVX2-NEXT: movq -32(%rsp,%rax), %r8 -; AVX2-NEXT: movq -24(%rsp,%rax), %r9 -; AVX2-NEXT: movq %r9, %r10 -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: movq -40(%rsp,%rax), %r11 -; AVX2-NEXT: movq -16(%rsp,%rax), %rax -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: shldq %cl, %r11, %r8 -; AVX2-NEXT: andnq 24(%rdi), %rax, %rax -; AVX2-NEXT: andnq 16(%rdi), %r10, %r9 -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: shlxq %rcx, %r11, %rcx -; AVX2-NEXT: andnq (%rdi), %rcx, %rcx -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %rax, 24(%rdi) -; AVX2-NEXT: movq %rcx, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: orq %rcx, %r9 -; AVX2-NEXT: movl (%rsi), %eax -; AVX2-NEXT: movl %ecx, (%rsi) -; AVX2-NEXT: movl (%rdx), %ecx -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: cmovnel %ecx, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: chain_reset_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: shrb $3, %al -; AVX512-NEXT: andb $24, %al -; AVX512-NEXT: negb %al -; AVX512-NEXT: movsbq %al, %rax -; AVX512-NEXT: movq -40(%rsp,%rax), %r8 -; AVX512-NEXT: movq -32(%rsp,%rax), %r9 -; AVX512-NEXT: movq -24(%rsp,%rax), %r10 -; AVX512-NEXT: movq %r10, %r11 -; AVX512-NEXT: shldq %cl, %r9, %r11 -; AVX512-NEXT: movq -16(%rsp,%rax), %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: shlxq %rcx, %r8, %r10 -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r8, %r9 -; AVX512-NEXT: andnq 24(%rdi), %rax, %rax -; AVX512-NEXT: andnq 16(%rdi), %r11, %rcx -; AVX512-NEXT: andnq 
8(%rdi), %r9, %r8 -; AVX512-NEXT: andnq (%rdi), %r10, %r9 -; AVX512-NEXT: movq %rcx, 16(%rdi) -; AVX512-NEXT: movq %rax, 24(%rdi) -; AVX512-NEXT: movq %r9, (%rdi) -; AVX512-NEXT: movq %r8, 8(%rdi) -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: movl (%rsi), %eax -; AVX512-NEXT: movl %r9d, (%rsi) -; AVX512-NEXT: movl (%rdx), %edx -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: chain_reset_i256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: movl $-2, %eax +; X64-NEXT: roll %cl, %eax +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $28, %ecx +; X64-NEXT: andl %eax, (%rdi,%rcx) +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: orq 24(%rdi), %r8 +; X64-NEXT: movq 16(%rdi), %rdi +; X64-NEXT: orq %rcx, %rdi +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq %rem = and i32 %position, 255 %ofs = zext nneg i32 %rem to i256 %bit = shl nuw i256 1, %ofs From 86d9e1c035fe14a18751523108ffdef092910b14 Mon Sep 17 00:00:00 2001 From: SKill <skill@google.com> Date: Wed, 5 Nov 2025 11:45:32 +0100 Subject: [PATCH 298/313] [clang] Delete duplicate code in sourcemanager (#166236) Now that the `SourceManager::getExpansionLoc` and `SourceManager::getSpellingLoc` functions are efficient, delete unnecessary code duplicate in `SourceManager::getDecomposedExpansionLoc` and `SourceManager::getDecomposedSpellingLoc` methods. --- clang/include/clang/Basic/SourceManager.h | 25 ++-------------- clang/lib/Basic/SourceManager.cpp | 35 ----------------------- 2 files changed, 2 insertions(+), 58 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index ed967fd47dc83..6d9d074d78026 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// If the location is an expansion record, walk through it until we find /// the final location expanded. FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - - return getDecomposedExpansionLocSlowCase(E); + return getDecomposedLoc(getExpansionLoc(Loc)); } /// Decompose the specified location into a raw FileID + Offset pair. @@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// If the location is an expansion record, walk through it until we find /// its spelling record. 
FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - return getDecomposedSpellingLocSlowCase(E, Offset); + return getDecomposedLoc(getSpellingLoc(Loc)); } /// Returns the "included/expanded in" decomposed location of the given @@ -1979,10 +1962,6 @@ class SourceManager : public RefCountedBase<SourceManager> { SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const; SourceLocation getFileLocSlowCase(SourceLocation Loc) const; - FileIDAndOffset - getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const; - FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const; void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const; void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache, FileID FID, diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 97aa0f2aa59b9..7dc81c50f87a2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -928,41 +928,6 @@ SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { return Loc; } -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase( - const SrcMgr::SLocEntry *E) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - unsigned Offset; - do { - Loc = E->getExpansion().getExpansionLocStart(); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - -FileIDAndOffset -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - do { - Loc = E->getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(Offset); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - /// getImmediateSpellingLoc - Given a SourceLocation object, return the /// spelling location referenced by the ID. This is the first level down /// towards the place where the characters that make up the lexed token can be From a65867ac319b0fbf5891b39df3c7b660070d63bd Mon Sep 17 00:00:00 2001 From: Elvina Yakubova <eyakubova@nvidia.com> Date: Wed, 5 Nov 2025 10:51:31 +0000 Subject: [PATCH 299/313] [BOLT][AArch64] Fix search to proceed upwards from memcpy call (#166182) The search should proceed from CallInst to the beginning of BB since X2 can be rewritten and we need to catch the most recent write before the call. 
Patch by Yafet Beyene alulayafet@gmail.com --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- bolt/test/runtime/AArch64/inline-memcpy.s | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 8a496c566b06b..57db6a436c5c6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2804,7 +2804,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..badff299603a0 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -84,6 +84,9 @@ # CHECK-ASM-LABEL: <test_register_move_negative>: # CHECK-ASM: bl{{.*}}<memcpy +# CHECK-ASM-LABEL: <test_x2_rewrite_negative>: +# CHECK-ASM: bl{{.*}}<memcpy + # Live-in parameter should NOT be inlined (size unknown at compile time) # CHECK-ASM-LABEL: <test_live_in_negative>: # CHECK-ASM: bl{{.*}}<memcpy @@ -273,6 +276,15 @@ test_register_move_negative: ret .size test_register_move_negative, .-test_register_move_negative + .globl test_x2_rewrite_negative + .type test_x2_rewrite_negative,@function +test_x2_rewrite_negative: + mov x2, #8 + ldr x2, [sp, #24] + bl memcpy + ret + .size test_x2_rewrite_negative, .-test_x2_rewrite_negative + .globl test_live_in_negative .type test_live_in_negative,@function test_live_in_negative: From 5821b09e5f348cb750d4b9cc2f532ca34cd8a6c6 Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Wed, 5 Nov 2025 11:52:59 +0100 Subject: [PATCH 300/313] [clang][bytecode] Print primitive arrays in Descriptor::dumpFull() (#166393) And recurse into records properly. 
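For example, for a hypothetical global `int arr[2]`, the descriptor dump now
includes the per-element layout, roughly like this (illustrative output only;
element offsets begin at `sizeof(InitMapPtr)` past the descriptor's offset and
advance by the element size, so the exact type name and numbers are
host-dependent):

```
Elements: 2
Element type: Sint32
Element 0 offset: 16
Element 1 offset: 20
```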
--- clang/lib/AST/ByteCode/Disasm.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index fd0903f2e652c..638028f84ff24 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset, FO += ElemDesc->getAllocSize(); } + } else if (isPrimitiveArray()) { + OS.indent(Spaces) << "Elements: " << getNumElems() << '\n'; + OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType()) + << '\n'; + unsigned FO = Offset + sizeof(InitMapPtr); + for (unsigned I = 0; I != getNumElems(); ++I) { + OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n'; + FO += getElemSize(); + } } else if (isRecord()) { ElemRecord->dump(OS, Indent + 1, Offset); + unsigned I = 0; + for (const Record::Field &F : ElemRecord->fields()) { + OS.indent(Spaces) << "- Field " << I << ": "; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << F.Decl->getName(); + } + OS << ". Offset " << (Offset + F.Offset) << "\n"; + F.Desc->dumpFull(Offset + F.Offset, Indent + 1); + ++I; + } } else if (isPrimitive()) { } else { } From d249e67a6a7c36a2ffcf08839810a8efa4603f18 Mon Sep 17 00:00:00 2001 From: Victor Campos <victor.campos@arm.com> Date: Wed, 5 Nov 2025 11:14:02 +0000 Subject: [PATCH 301/313] [libc][math] Disable `FEnvSafeTest.cpp` if AArch64 target has no FP support (#166370) The `FEnvSafeTest.cpp` test fails on AArch64 soft nofp configurations because LLVM libc does not provide a floating-point environment in these configurations. This patch adds another preprocessor guard on `__ARM_FP` to disable the test on those. --- libc/test/UnitTest/FEnvSafeTest.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); From fedd3b0399e78f47c5795f436b88722319832172 Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Wed, 5 Nov 2025 12:15:48 +0100 Subject: [PATCH 302/313] [clang][bytecode] Remove dummy variables once they are proper globals (#166174) Dummy variables have an entry in `Program::Globals`, but they are not added to `GlobalIndices`. When registering redeclarations, we used to only patch up the global indices, but that left the dummy variables alone. Update the dummy variables of all redeclarations as well. 
Fixes https://github.com/llvm/llvm-project/issues/165952
---
 clang/lib/AST/ByteCode/Program.cpp  | 29 +++++++++++++++++++++----
 clang/lib/AST/ByteCode/Program.h    |  1 -
 clang/test/AST/ByteCode/records.cpp | 11 +++++++++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
index e0b2852f0e906..2425373ab2ef8 100644
--- a/clang/lib/AST/ByteCode/Program.cpp
+++ b/clang/lib/AST/ByteCode/Program.cpp
@@ -218,21 +218,42 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) {
     return std::nullopt;
 
   Global *NewGlobal = Globals[*Idx];
+  // Note that this loop has one iteration where Redecl == VD.
   for (const Decl *Redecl : VD->redecls()) {
-    unsigned &PIdx = GlobalIndices[Redecl];
+
+    // If this redecl was registered as a dummy variable, it is now a proper
+    // global variable and points to the block we just created.
+    if (auto DummyIt = DummyVariables.find(Redecl);
+        DummyIt != DummyVariables.end()) {
+      assert(!Globals[DummyIt->second]->block()->hasPointers());
+      Globals[DummyIt->second] = NewGlobal;
+      DummyVariables.erase(DummyIt);
+    }
+
+    // If the redeclaration hasn't been registered at all yet, we just set its
+    // global index to Idx. If it has already been registered, it might have
+    // pointers pointing to it and we need to transfer those pointers to the
+    // new block.
+    auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl);
+    if (Inserted) {
+      GlobalIndices[Redecl] = *Idx;
+      continue;
+    }
+
     if (Redecl != VD) {
-      if (Block *RedeclBlock = Globals[PIdx]->block();
+      if (Block *RedeclBlock = Globals[Iter->second]->block();
           RedeclBlock->isExtern()) {
-        Globals[PIdx] = NewGlobal;
+
         // All pointers pointing to the previous extern decl now point to the
        // new decl.
        // A previous iteration might've already fixed up the pointers for this
        // global.
        if (RedeclBlock != NewGlobal->block())
          RedeclBlock->movePointersTo(NewGlobal->block());
+
+        Globals[Iter->second] = NewGlobal;
       }
     }
-    PIdx = *Idx;
+    Iter->second = *Idx;
   }
 
   return *Idx;
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 28fcc97f5339d..cc9127dc77860 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -205,7 +205,6 @@ class Program final {
     const Block *block() const { return &B; }
 
   private:
-    /// Required metadata - does not actually track pointers.
     Block B;
   };
 
diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp
index 83f32c97c50c7..4799ebe25dde1 100644
--- a/clang/test/AST/ByteCode/records.cpp
+++ b/clang/test/AST/ByteCode/records.cpp
@@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody {
   }
   int n = f(0); // both-note {{instantiation of}}
 }
+
+namespace StaticRedecl {
+  struct T {
+    static T tt;
+    constexpr T() : p(&tt) {}
+    T *p;
+  };
+  T T::tt;
+  constexpr T t;
+  static_assert(t.p == &T::tt, "");
+}

From e8564830c19e6fd5bfa38488c06f332b214ea858 Mon Sep 17 00:00:00 2001
From: Congcong Cai <congcongcai0907@163.com>
Date: Wed, 5 Nov 2025 19:25:53 +0800
Subject: [PATCH 303/313] [clang-tidy][doc] add more information in
 twine-local's document (#166266)

Explain more about use-after-free in llvm-twine-local.
Add a note about manually adjusting code after applying fix-it hints.
fixed: #154810
---
 .../docs/clang-tidy/checks/llvm/twine-local.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
index ec9ef1c60913c..6c994a48d83de 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
@@ -14,3 +14,21 @@ should be generally avoided.
   // becomes
   static std::string Moo = (Twine("bark") + "bah").str();
+
+The ``Twine`` does not own the memory of its contents, so it is not
+recommended to use ``Twine`` created from temporary strings or string literals.
+
+.. code-block:: c++
+
+  static Twine getModuleIdentifier(StringRef moduleName) {
+    return moduleName + "_module";
+  }
+  void foo() {
+    Twine result = getModuleIdentifier(std::string{"abc"} + "def");
+    // temporary std::string is destroyed here, result is dangling
+  }
+
+After applying these fix-it hints, the code will use ``std::string`` instead of
+``Twine`` for local variables. However, ``Twine`` has lots of methods that
+are incompatible with ``std::string``, so the user may need to adjust the code
+manually after applying the fix-it hints.

From c1dc064ba063f0d679f1a6d6aeef99becea8b709 Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Wed, 5 Nov 2025 19:32:34 +0800
Subject: [PATCH 304/313] [CIR] Add support for storing into _Atomic variables
 (#165872)

---
 clang/include/clang/CIR/MissingFeatures.h |   2 +
 clang/lib/CIR/CodeGen/CIRGenAtomic.cpp    | 121 +++++++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp      |  14 ++-
 clang/lib/CIR/CodeGen/CIRGenFunction.h    |   7 +-
 clang/test/CIR/CodeGen/atomic.c           |  26 +++++
 5 files changed, 162 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 48ef8be9fb782..6f099a7027a10 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -180,6 +180,8 @@ struct MissingFeatures {
   static bool atomicSyncScopeID() { return false; }
   static bool atomicTypes() { return false; }
   static bool atomicUseLibCall() { return false; }
+  static bool atomicMicrosoftVolatile() { return false; }
+  static bool atomicOpenMP() { return false; }
 
   // Global ctor handling
   static bool globalCtorLexOrder() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
index 7db6e283ec0a5..cd4c1f0e5b769 100644
--- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
@@ -27,6 +27,7 @@ class AtomicInfo {
   CharUnits atomicAlign;
   CharUnits valueAlign;
   TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
+  bool useLibCall = true;
   LValue lvalue;
   mlir::Location loc;
 
@@ -62,8 +63,8 @@ class AtomicInfo {
       assert(!cir::MissingFeatures::atomicInfo());
       cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
     }
-
-    assert(!cir::MissingFeatures::atomicUseLibCall());
+    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
+        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
   }
 
   QualType getValueType() const { return valueTy; }
@@ -75,6 +76,8 @@ class AtomicInfo {
     assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
     return nullptr;
   }
+  bool shouldUseLibCall() const { return useLibCall; }
+  const LValue &getAtomicLValue() const { return lvalue; }
   Address getAtomicAddress() const {
     mlir::Type elemTy;
     if (lvalue.isSimple()) {
@@ -96,6 +99,8 @@ class AtomicInfo {
 
   bool emitMemSetZeroIfNecessary() const;
 
+  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
+
   /// Cast the given pointer to an integer pointer suitable for atomic
   /// operations on the source.
   Address castToAtomicIntPointer(Address addr) const;
@@ -105,6 +110,9 @@ class AtomicInfo {
   /// copy the value across.
   Address convertToAtomicIntPointer(Address addr) const;
 
+  /// Converts an rvalue to an integer value.
+  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
+
   /// Copy an atomic r-value into atomic-layout memory.
   void emitCopyIntoMemory(RValue rvalue) const;
 
@@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
   return tempAlloca;
}
 
+mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
+  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
+    return rvalue.getValue();
+  return nullptr;
+}
+
 Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
   auto intTy = mlir::dyn_cast<cir::IntType>(addr.getElementType());
   // Don't bother with int casts if the integer size is the same.
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
     return false;
 
   cgf.cgm.errorNYI(loc,
                    "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
   return false;
 }
 
+/// Return true if \param valueTy is a type that should be casted to integer
+/// around the atomic memory operation. If \param cmpxchg is true, then the
+/// cast of a floating point type is made as that instruction can not have
+/// floating point operands. TODO: Allow compare-and-exchange and FP - see
+/// comment in CIRGenAtomicExpandPass.cpp.
+static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
+  if (cir::isAnyFloatingPointType(valueTy))
+    return isa<cir::FP80Type>(valueTy) || cmpxchg;
+  return !isa<cir::IntType>(valueTy) && !isa<cir::PointerType>(valueTy);
+}
+
+mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
+  // If we've got a scalar value of the right size, try to avoid going
+  // through memory. Floats get casted if needed by AtomicExpandPass.
+  if (mlir::Value value = getScalarRValValueOrNull(rvalue)) {
+    if (!shouldCastToInt(value.getType(), cmpxchg))
+      return cgf.emitToMemory(value, valueTy);
+
+    cgf.cgm.errorNYI(
+        loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int");
+    return nullptr;
+  }
+
+  cgf.cgm.errorNYI(
+      loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int");
+  return nullptr;
+}
+
 /// Copy an r-value into memory as part of storing to an atomic type.
 /// This needs to create a bit-pattern suitable for atomic operations.
 void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
@@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) {
                        e->getExprLoc());
 }
 
+void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) {
+  bool isVolatile = dest.isVolatileQualified();
+  auto order = cir::MemOrder::SequentiallyConsistent;
+  if (!dest.getType()->isAtomicType()) {
+    assert(!cir::MissingFeatures::atomicMicrosoftVolatile());
+  }
+  return emitAtomicStore(rvalue, dest, order, isVolatile, isInit);
+}
+
+/// Emit a store to an l-value of atomic type.
+///
+/// Note that the r-value is expected to be an r-value of the atomic type; this
+/// means that for aggregate r-values, it should include storage for any padding
+/// that was necessary.
+void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store. + Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. + if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 4fb178df0e508..422fa1cf5ad2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) { @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index dece642eb13b6..1c52a78d72e33 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1271,6 +1271,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1680,8 +1683,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 65799881a0cbe..d5bea8446d730 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -46,6 +46,32 @@ void f2(void) { // OGCG-NEXT: store i32 42, ptr %[[SLOT]], align 4 // OGCG: } +void f3(_Atomic(int) *p) { + *p = 42; +} + +// CIR-LABEL: @f3 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr<!s32i> + +// LLVM-LABEL: @f3 +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f3 +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +void f4(_Atomic(float) *p) { + *p = 3.14; +} + +// CIR-LABEL: @f4 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr<!cir.float> + +// LLVM-LABEL: @f4 +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f4 +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + void load(int *ptr) { int x; __atomic_load(ptr, &x, __ATOMIC_RELAXED); From a38e0942407ef3395264831ef971838b69d2a652 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Wed, 5 Nov 2025 21:04:32 +0900 Subject: [PATCH 305/313] [mlir] Dialect Conversion: Add support for post-order legalization order (#166292) By default, the dialect conversion driver processes operations in pre-order: the initial worklist is populated pre-order. (New/modified operations are immediately legalized recursively.) This commit adds a new API for selective post-order legalization. Patterns can request an operation / region legalization via `ConversionPatternRewriter::legalize`. They can call these helper functions on nested regions before rewriting the operation itself. Note: In rollback mode, a failed recursive legalization typically leads to a conversion failure. Since recursive legalization is performed by separate pattern applications, there is no way for the original pattern to recover from such a failure. 
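For illustration, here is a minimal sketch of a pattern that opts into
post-order legalization; `MyWrapperOp` and `LoweredWrapperOp` are hypothetical
ops used only for this example and are not part of this patch:

```c++
struct WrapperLowering : public OpConversionPattern<MyWrapperOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(MyWrapperOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Legalize the nested region first (post-order). If the pattern was
    // constructed with a type converter, this also converts the entry block
    // signature.
    if (failed(rewriter.legalize(&op.getBody())))
      return failure();

    // Rewrite the op itself now that its body is already legal, moving the
    // legalized region into the replacement op.
    auto newOp = rewriter.create<LoweredWrapperOp>(op.getLoc(),
                                                   adaptor.getOperands());
    rewriter.inlineRegionBefore(op.getBody(), newOp.getBody(),
                                newOp.getBody().end());
    rewriter.replaceOp(op, newOp->getResults());
    return success();
  }
};
```

If the recursive `legalize` call fails, the pattern itself fails, which is the
rollback-mode behavior described above.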
--- .../mlir/Transforms/DialectConversion.h | 25 +++- .../Transforms/Utils/DialectConversion.cpp | 120 +++++++++++++----- mlir/test/Transforms/test-legalizer-full.mlir | 18 +++ .../Transforms/test-legalizer-rollback.mlir | 19 +++ mlir/test/Transforms/test-legalizer.mlir | 32 +++++ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 22 +++- 6 files changed, 199 insertions(+), 37 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index ed7e2a08ebfd9..5ac9e26e8636d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -981,6 +981,28 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Return a reference to the internal implementation. detail::ConversionPatternRewriterImpl &getImpl(); + /// Attempt to legalize the given operation. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the operation was legalized, "failure" otherwise. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// the operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Operation *op); + + /// Attempt to legalize the given region. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the region was legalized, "failure" otherwise. + /// + /// If the current pattern runs with a type converter, the entry block + /// signature will be converted before legalizing the operations in the + /// region. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// an operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Region *r); + private: // Allow OperationConverter to construct new rewriters. friend struct OperationConverter; @@ -989,7 +1011,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. explicit ConversionPatternRewriter(MLIRContext *ctx, - const ConversionConfig &config); + const ConversionConfig &config, + OperationConverter &converter); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2fe06970eb568..f8c38fadbd229 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -92,6 +92,22 @@ static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) { return pt; } +namespace { +enum OpConversionMode { + /// In this mode, the conversion will ignore failed conversions to allow + /// illegal operations to co-exist in the IR. + Partial, + + /// In this mode, all operations must be legal for the given target for the + /// conversion to succeed. + Full, + + /// In this mode, operations are analyzed for legality. No actual rewrites are + /// applied to the operations on success. 
+ Analysis, +}; +} // namespace + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// @@ -866,8 +882,9 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(ConversionPatternRewriter &rewriter, - const ConversionConfig &config) - : rewriter(rewriter), config(config), + const ConversionConfig &config, + OperationConverter &opConverter) + : rewriter(rewriter), config(config), opConverter(opConverter), notifyingRewriter(rewriter.getContext(), config.listener) {} //===--------------------------------------------------------------------===// @@ -1124,6 +1141,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Dialect conversion configuration. const ConversionConfig &config; + /// The operation converter to use for recursive legalization. + OperationConverter &opConverter; + /// A set of erased operations. This set is utilized only if /// `allowPatternRollback` is set to "false". Conceptually, this set is /// similar to `replacedOps` (which is maintained when the flag is set to @@ -2084,9 +2104,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( //===----------------------------------------------------------------------===// ConversionPatternRewriter::ConversionPatternRewriter( - MLIRContext *ctx, const ConversionConfig &config) - : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + MLIRContext *ctx, const ConversionConfig &config, + OperationConverter &opConverter) + : PatternRewriter(ctx), impl(new detail::ConversionPatternRewriterImpl( + *this, config, opConverter)) { setListener(impl.get()); } @@ -2207,6 +2228,37 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, return success(); } +LogicalResult ConversionPatternRewriter::legalize(Region *r) { + // Fast path: If the region is empty, there is nothing to legalize. + if (r->empty()) + return success(); + + // Gather a list of all operations to legalize. This is done before + // converting the entry block signature because unrealized_conversion_cast + // ops should not be included. + SmallVector<Operation *> ops; + for (Block &b : *r) + for (Operation &op : b) + ops.push_back(&op); + + // If the current pattern runs with a type converter, convert the entry block + // signature. + if (const TypeConverter *converter = impl->currentTypeConverter) { + std::optional<TypeConverter::SignatureConversion> conversion = + converter->convertBlockSignature(&r->front()); + if (!conversion) + return failure(); + applySignatureConversion(&r->front(), *conversion, converter); + } + + // Legalize all operations in the region. + for (Operation *op : ops) + if (failed(legalize(op))) + return failure(); + + return success(); +} + void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { @@ -3192,22 +3244,6 @@ static void reconcileUnrealizedCasts( // OperationConverter //===----------------------------------------------------------------------===// -namespace { -enum OpConversionMode { - /// In this mode, the conversion will ignore failed conversions to allow - /// illegal operations to co-exist in the IR. - Partial, - - /// In this mode, all operations must be legal for the given target for the - /// conversion to succeed. 
- Full, - - /// In this mode, operations are analyzed for legality. No actual rewrites are - /// applied to the operations on success. - Analysis, -}; -} // namespace - namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the @@ -3217,16 +3253,20 @@ struct OperationConverter { const FrozenRewritePatternSet &patterns, const ConversionConfig &config, OpConversionMode mode) - : rewriter(ctx, config), opLegalizer(rewriter, target, patterns), + : rewriter(ctx, config, *this), opLegalizer(rewriter, target, patterns), mode(mode) {} /// Converts the given operations to the conversion target. LogicalResult convertOperations(ArrayRef<Operation *> ops); -private: - /// Converts an operation with the given rewriter. - LogicalResult convert(Operation *op); + /// Converts a single operation. If `isRecursiveLegalization` is "true", the + /// conversion is a recursive legalization request, triggered from within a + /// pattern. In that case, do not emit errors because there will be another + /// attempt at legalizing the operation later (via the regular pre-order + /// legalization mechanism). + LogicalResult convert(Operation *op, bool isRecursiveLegalization = false); +private: /// The rewriter to use when converting operations. ConversionPatternRewriter rewriter; @@ -3238,32 +3278,42 @@ struct OperationConverter { }; } // namespace mlir -LogicalResult OperationConverter::convert(Operation *op) { +LogicalResult ConversionPatternRewriter::legalize(Operation *op) { + return impl->opConverter.convert(op, /*isRecursiveLegalization=*/true); +} + +LogicalResult OperationConverter::convert(Operation *op, + bool isRecursiveLegalization) { const ConversionConfig &config = rewriter.getConfig(); // Legalize the given operation. if (failed(opLegalizer.legalize(op))) { // Handle the case of a failed conversion for each of the different modes. // Full conversions expect all operations to be converted. - if (mode == OpConversionMode::Full) - return op->emitError() - << "failed to legalize operation '" << op->getName() << "'"; + if (mode == OpConversionMode::Full) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "'"; + return failure(); + } // Partial conversions allow conversions to fail iff the operation was not // explicitly marked as illegal. If the user provided a `unlegalizedOps` // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { - if (opLegalizer.isIllegal(op)) - return op->emitError() - << "failed to legalize operation '" << op->getName() - << "' that was explicitly marked illegal"; - if (config.unlegalizedOps) + if (opLegalizer.isIllegal(op)) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "' that was explicitly marked illegal"; + return failure(); + } + if (config.unlegalizedOps && !isRecursiveLegalization) config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. 
- if (config.legalizableOps) + if (config.legalizableOps && !isRecursiveLegalization) config.legalizableOps->insert(op); } return success(); diff --git a/mlir/test/Transforms/test-legalizer-full.mlir b/mlir/test/Transforms/test-legalizer-full.mlir index 42cec68b9fbbb..8da9109a32762 100644 --- a/mlir/test/Transforms/test-legalizer-full.mlir +++ b/mlir/test/Transforms/test-legalizer-full.mlir @@ -72,3 +72,21 @@ builtin.module { } } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// expected-remark@+1 {{applyFullConversion failed}} +builtin.module { +func.func @test_preorder_legalization() { + // expected-error@+1 {{failed to legalize operation 'test.post_order_legalization'}} + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // Not-explicitly-legal ops are not allowed to survive. + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer-rollback.mlir b/mlir/test/Transforms/test-legalizer-rollback.mlir index 71e11782e14b0..4bcca6b7e5228 100644 --- a/mlir/test/Transforms/test-legalizer-rollback.mlir +++ b/mlir/test/Transforms/test-legalizer-rollback.mlir @@ -163,3 +163,22 @@ func.func @create_unregistered_op_in_pattern() -> i32 { "test.return"(%0) : (i32) -> () } } + +// ----- + +// CHECK-LABEL: func @test_failed_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: %[[r:.*]] = "test.illegal_op_g"() : () -> i32 +// CHECK: "test.return"(%[[r]]) : (i32) -> () +// CHECK: }) : () -> () +// expected-remark @+1 {{applyPartialConversion failed}} +module { +func.func @test_failed_preorder_legalization() { + // expected-error @+1 {{failed to legalize operation 'test.post_order_legalization' that was explicitly marked illegal}} + "test.post_order_legalization"() ({ + %0 = "test.illegal_op_g"() : () -> (i32) + "test.return"(%0) : (i32) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 7c43bb7bface0..88a71cc26ab0c 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -448,3 +448,35 @@ func.func @test_working_1to1_pattern(%arg0: f16) { "test.type_consumer"(%arg0) : (f16) -> () "test.return"() : () -> () } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// CHECK: notifyBlockInserted into test.post_order_legalization: was unlinked +// CHECK: notifyOperationInserted: test.invalid +// CHECK: notifyBlockErased +// CHECK: notifyOperationInserted: test.valid, was unlinked +// CHECK: notifyOperationReplaced: test.invalid +// CHECK: notifyOperationErased: test.invalid +// CHECK: notifyOperationModified: test.post_order_legalization + +// CHECK-LABEL: func @test_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: ^{{.*}}(%[[arg0:.*]]: f64): +// Note: The survival of a not-explicitly-invalid operation does *not* cause +// a conversion failure when applying a partial conversion.
+// CHECK: %[[cast:.*]] = "test.cast"(%[[arg0]]) : (f64) -> i64 +// CHECK: "test.remaining_consumer"(%[[cast]]) : (i64) -> () +// CHECK: "test.valid"(%[[arg0]]) : (f64) -> () +// CHECK: }) {is_legal} : () -> () +func.func @test_preorder_legalization() { + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // expected-remark @+1 {{'test.remaining_consumer' is not legalizable}} + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + // expected-remark @+1 {{'func.return' is not legalizable}} + return +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 12edecc113495..9b64bc691588d 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1418,6 +1418,22 @@ class TestTypeConsumerOpPattern } }; +class TestPostOrderLegalization : public ConversionPattern { +public: + TestPostOrderLegalization(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, "test.post_order_legalization", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands, + ConversionPatternRewriter &rewriter) const final { + for (Region &r : op->getRegions()) + if (failed(rewriter.legalize(&r))) + return failure(); + rewriter.modifyOpInPlace( + op, [&]() { op->setAttr("is_legal", rewriter.getUnitAttr()); }); + return success(); + } +}; + /// Test unambiguous overload resolution of replaceOpWithMultiple. This /// function is just to trigger compiler errors. It is never executed. [[maybe_unused]] void testReplaceOpWithMultipleOverloads( @@ -1532,7 +1548,8 @@ struct TestLegalizePatternDriver patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp, TestPassthroughInvalidOp, TestMultiple1ToNReplacement, TestValueReplace, TestReplaceWithValidConsumer, - TestTypeConsumerOpPattern>(&getContext(), converter); + TestTypeConsumerOpPattern, TestPostOrderLegalization>( + &getContext(), converter); patterns.add<TestConvertBlockArgs>(converter, &getContext()); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); @@ -1560,6 +1577,9 @@ struct TestLegalizePatternDriver target.addDynamicallyLegalOp( OperationName("test.value_replace", &getContext()), [](Operation *op) { return op->hasAttr("is_legal"); }); + target.addDynamicallyLegalOp( + OperationName("test.post_order_legalization", &getContext()), + [](Operation *op) { return op->hasAttr("is_legal"); }); // TestCreateUnregisteredOp creates `arith.constant` operation, // which was not added to target intentionally to test From 6c640b86e6e03298385231cb7e77d2f3524bc643 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Wed, 5 Nov 2025 21:24:59 +0900 Subject: [PATCH 306/313] [mlir][LLVM] Fix unsupported FP lowering in `VectorConvertToLLVMPattern` (#166513) Fixes a bug in `VectorConvertToLLVMPattern`, which converted operations with unsupported FP types. E.g., `arith.addf ... : f4E2M1FN` was lowered to `llvm.fadd ... : i4`, which does not verify. There are a few more patterns that have the same bug. Those will be fixed in follow-up PRs. This commit is in preparation of adding an `APFloat`-based lowering for `arith` operations with unsupported floating-point types. 
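Concretely, the guard added to `VectorConvertToLLVMPattern::matchAndRewrite` boils down to the following check (a condensed sketch of the logic introduced below; `getFloatingPointType` is the helper added by this patch):

  // Bail out when a floating-point operand or result would be converted to a
  // non-floating-point type: the type converter falls back to integers for FP
  // types that LLVM cannot represent (e.g. f4E2M1FN -> i4), and emitting the
  // target op on such a type would produce invalid IR.
  auto checkType = [&](Value v) -> LogicalResult {
    FloatType floatType = getFloatingPointType(v.getType());
    if (!floatType)
      return success();
    Type convertedType = this->getTypeConverter()->convertType(floatType);
    if (!isa_and_nonnull<FloatType>(convertedType))
      return rewriter.notifyMatchFailure(op, "unsupported floating point type");
    return success();
  };
  for (Value operand : op->getOperands())
    if (failed(checkType(operand)))
      return failure();
  if (failed(checkType(op->getResult(0))))
    return failure();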
--- .../Conversion/LLVMCommon/VectorPattern.h | 31 +++++++++++++++++++ .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 26 ++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 964281592cc65..cad6cec761ab8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -92,12 +92,43 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> { using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; using Super = VectorConvertToLLVMPattern<SourceOp, TargetOp>; + /// Return the given type if it's a floating point type. If the given type is + /// a vector type, return its element type if it's a floating point type. + static FloatType getFloatingPointType(Type type) { + if (auto floatType = dyn_cast<FloatType>(type)) + return floatType; + if (auto vecType = dyn_cast<VectorType>(type)) + return dyn_cast<FloatType>(vecType.getElementType()); + return nullptr; + } + LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { static_assert( std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, "expected single result op"); + + // The pattern should not apply if a floating-point operand is converted to + // a non-floating-point type. This indicates that the floating point type + // is not supported by the LLVM lowering. (Such types are converted to + // integers.) + auto checkType = [&](Value v) -> LogicalResult { + FloatType floatType = getFloatingPointType(v.getType()); + if (!floatType) + return success(); + Type convertedType = this->getTypeConverter()->convertType(floatType); + if (!isa_and_nonnull<FloatType>(convertedType)) + return rewriter.notifyMatchFailure(op, + "unsupported floating point type"); + return success(); + }; + for (Value operand : op->getOperands()) + if (failed(checkType(operand))) + return failure(); + if (failed(checkType(op->getResult(0)))) + return failure(); + // Determine attributes for the target op AttrConvert<SourceOp, TargetOp> attrConvert(op); diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index ba12ff29ebef9..b5dcb01d3dc6b 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -747,3 +747,29 @@ func.func @memref_bitcast(%1: memref<?xi16>) -> memref<?xbf16> { %2 = arith.bitcast %1 : memref<?xi16> to memref<?xbf16> func.return %2 : memref<?xbf16> } + +// ----- + +// CHECK-LABEL: func @unsupported_fp_type +// CHECK: arith.addf {{.*}} : f4E2M1FN +// CHECK: arith.addf {{.*}} : vector<4xf4E2M1FN> +// CHECK: arith.addf {{.*}} : vector<8x4xf4E2M1FN> +func.func @unsupported_fp_type(%arg0: f4E2M1FN, %arg1: vector<4xf4E2M1FN>, %arg2: vector<8x4xf4E2M1FN>) -> (f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN>) { + %0 = arith.addf %arg0, %arg0 : f4E2M1FN + %1 = arith.addf %arg1, %arg1 : vector<4xf4E2M1FN> + %2 = arith.addf %arg2, %arg2 : vector<8x4xf4E2M1FN> + return %0, %1, %2 : f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN> +} + +// ----- + +// CHECK-LABEL: func @supported_fp_type +// CHECK: llvm.fadd {{.*}} : f32 +// CHECK: llvm.fadd {{.*}} : vector<4xf32> +// CHECK-COUNT-4: llvm.fadd {{.*}} : vector<8xf32> +func.func @supported_fp_type(%arg0: f32, %arg1: vector<4xf32>, %arg2: vector<4x8xf32>) -> (f32, vector<4xf32>, 
vector<4x8xf32>) { + %0 = arith.addf %arg0, %arg0 : f32 + %1 = arith.addf %arg1, %arg1 : vector<4xf32> + %2 = arith.addf %arg2, %arg2 : vector<4x8xf32> + return %0, %1, %2 : f32, vector<4xf32>, vector<4x8xf32> +} From c782ed3440b5a1565428db9731504fd1c4c2a9a9 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin <valery.pykhtin@amd.com> Date: Wed, 5 Nov 2025 13:31:10 +0100 Subject: [PATCH 307/313] [utils][UpdateLLCTestChecks] Add MIR support to update_llc_test_checks.py. (#164965) This change enables update_llc_test_checks.py to automatically generate MIR checks for RUN lines that use `-stop-before` or `-stop-after` flags, allowing tests to verify intermediate compilation stages (e.g., after instruction selection but before peephole optimizations) alongside the final assembly output. If the `-debug-only` flag is present in the run line, it is considered the main point of interest for testing and the stop flags above are ignored (that is, no MIR checks are generated). This resulted from a scenario in which I needed to test two instruction-matching patterns, where the later pattern in the peephole optimizer reverts the earlier pattern from the instruction selector, and to distinguish that from the case where the earlier pattern didn't work at all. Initially created by Claude Sonnet 4.5, it was later improved to handle conflicts in MIR <-> ASM prefixes and formatting. --- .../Inputs/x86_asm_mir_mixed.ll | 17 ++++ .../Inputs/x86_asm_mir_mixed.ll.expected | 45 +++++++++ .../Inputs/x86_asm_mir_same_prefix.ll | 13 +++ .../x86_asm_mir_same_prefix.ll.expected | 16 ++++ .../x86-asm-mir-mixed.test | 9 ++ .../x86-asm-mir-same-prefix.test | 7 ++ llvm/utils/UpdateTestChecks/common.py | 1 + llvm/utils/UpdateTestChecks/mir.py | 11 ++- llvm/utils/update_llc_test_checks.py | 94 +++++++++++++++---- 9 files changed, 194 insertions(+), 19 deletions(-) create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new file mode 100644 index 0000000000000..88cb03e85204a ---
/dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..bb91a44678f1a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,7 @@ +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. + +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 2dad16a8eebb7..baa0377cd8b81 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -605,6 +605,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)") MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") +STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py index 24bb8b341d335..01ee0e19f7cb9 100644 --- a/llvm/utils/UpdateTestChecks/mir.py +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -163,13 +163,15 @@ def add_mir_checks_for_function( print_fixed_stack, first_check_is_next, at_the_function_name, + check_indent=None, ): printed_prefixes = set() for run in run_list: for prefix in run[0]: if prefix in printed_prefixes: break - if not func_dict[prefix][func_name]: + # func_info can be empty if there was a prefix conflict. + if not func_dict[prefix].get(func_name): continue if printed_prefixes: # Add some space between different check prefixes. 
@@ -185,6 +187,7 @@ def add_mir_checks_for_function( func_dict[prefix][func_name], print_fixed_stack, first_check_is_next, + check_indent, ) break else: @@ -204,6 +207,7 @@ def add_mir_check_lines( func_info, print_fixed_stack, first_check_is_next, + check_indent=None, ): func_body = str(func_info).splitlines() if single_bb: @@ -220,7 +224,10 @@ def add_mir_check_lines( first_line = func_body[0] indent = len(first_line) - len(first_line.lstrip(" ")) # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) + if check_indent is not None: + check = "{}; {}".format(check_indent, prefix) + else: + check = "{:>{}}; {}".format("", indent, prefix) output_lines.append("{}-LABEL: name: {}".format(check, func_name)) diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py index 8c57e75f34f75..98864be62875b 100755 --- a/llvm/utils/update_llc_test_checks.py +++ b/llvm/utils/update_llc_test_checks.py @@ -15,7 +15,7 @@ import os # Used to advertise this file's name ("autogenerated_note"). import sys -from UpdateTestChecks import common +from UpdateTestChecks import common, mir # llc is the only llc-like in the LLVM tree but downstream forks can add # additional ones here if they have them. @@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo): break run_list = [] + mir_run_list = [] for l in ti.run_lines: if "|" not in l: common.warn("Skipping unparsable RUN line: " + l) @@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo): if m: march_in_cmd = m.groups()[0] + target_list = run_list m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd) if m and m.groups()[0] == "isel": from UpdateTestChecks import isel as output_type + elif not m and common.STOP_PASS_RE.search(llc_cmd): + # MIR output mode. If -debug-only is present assume + # the debug output is the main point of interest. + target_list = mir_run_list else: from UpdateTestChecks import asm as output_type @@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo): # FIXME: We should use multiple check prefixes to common check lines. For # now, we just ignore all but the last. - run_list.append( + target_list.append( ( check_prefixes, llc_tool, @@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo): ginfo=ginfo, ) - for ( - prefixes, - llc_tool, - llc_args, - preprocess_cmd, - triple_in_cmd, - march_in_cmd, - ) in run_list: + # Dictionary to store MIR function bodies separately + mir_func_dict = {} + for run_tuple, is_mir in [(run, False) for run in run_list] + [ + (run, True) for run in mir_run_list + ]: + ( + prefixes, + llc_tool, + llc_args, + preprocess_cmd, + triple_in_cmd, + march_in_cmd, + ) = run_tuple + common.debug("Extracted LLC cmd:", llc_tool, llc_args) common.debug("Extracted FileCheck prefixes:", str(prefixes)) @@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo): if not triple: triple = common.get_triple_from_march(march_in_cmd) - scrubber, function_re = output_type.get_run_handler(triple) - if 0 == builder.process_run_line( - function_re, scrubber, raw_tool_output, prefixes - ): - common.warn( - "Couldn't match any function. 
Possibly the wrong target triple has been provided" + if is_mir: + # MIR output mode + common.debug("Detected MIR output mode for prefixes:", str(prefixes)) + for prefix in prefixes: + if prefix not in mir_func_dict: + mir_func_dict[prefix] = {} + + mir.build_function_info_dictionary( + ti.path, + raw_tool_output, + triple, + prefixes, + mir_func_dict, + ti.args.verbose, ) - builder.processed_prefixes(prefixes) + else: + # ASM output mode + scrubber, function_re = output_type.get_run_handler(triple) + if 0 == builder.process_run_line( + function_re, scrubber, raw_tool_output, prefixes + ): + common.warn( + "Couldn't match any function. Possibly the wrong target triple has been provided" + ) + builder.processed_prefixes(prefixes) func_dict = builder.finish_and_get_func_dict() + + # Check for conflicts: same prefix used for both ASM and MIR + conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys()) + if conflicting_prefixes: + common.warn( + "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format( + ", ".join(sorted(conflicting_prefixes)) + ), + test_file=ti.path, + ) + for prefix in conflicting_prefixes: + mir_func_dict[prefix] = {} + func_dict[prefix] = {} + global_vars_seen_dict = {} is_in_function = False is_in_function_start = False func_name = None prefix_set = set([prefix for p in run_list for prefix in p[0]]) + prefix_set.update([prefix for p in mir_run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) output_lines = [] @@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo): is_filtered=builder.is_filtered(), ) ) + + # Also add MIR checks if we have them for this function + if mir_run_list and func_name: + mir.add_mir_checks_for_function( + ti.path, + output_lines, + mir_run_list, + mir_func_dict, + func_name, + single_bb=False, # Don't skip basic block labels. + print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it). + first_check_is_next=False, # First check is LABEL, not NEXT. + at_the_function_name=False, # Use "name:" not "@name". + check_indent="", # No indentation for IR files (not MIR files). 
+ ) + is_in_function_start = False if is_in_function: From a4105707eeaf53c13c2f09298d762995267c7717 Mon Sep 17 00:00:00 2001 From: hev <wangrui@loongson.cn> Date: Wed, 5 Nov 2025 20:36:38 +0800 Subject: [PATCH 308/313] [llvm][LoongArch] Introduce LASX and LSX conversion intrinsics (#157818) This patch introduces the LASX and LSX conversion intrinsics: - <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) - <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) - <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) - <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) --- llvm/include/llvm/IR/IntrinsicsLoongArch.td | 38 +++ .../LoongArch/LoongArchISelLowering.cpp | 5 + .../LoongArch/LoongArchLASXInstrInfo.td | 31 ++ .../LoongArch/lasx/intrinsic-conversion.ll | 303 ++++++++++++++++++ 4 files changed, 377 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def 
int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e17d341b..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056c4cdf..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), 
LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + 
%c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + 
+define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} From 7844829184f16feaf5ba6f8fbb23308abd03a0ba Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Wed, 5 Nov 2025 12:38:30 
+0000 Subject: [PATCH 309/313] Adding the missing origin count logic to AnalysisBasedWarnings --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 16 ++++------ .../Analyses/LifetimeSafety/Origins.h | 7 +++-- .../clang/Sema/AnalysisBasedWarnings.h | 9 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 27 +++++------------ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 +++++++--- clang/lib/Sema/AnalysisBasedWarnings.cpp | 30 +++++++++++++++++-- 6 files changed, 67 insertions(+), 38 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 7490df90a3282..eb532bc8be3a7 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -48,10 +48,6 @@ class LifetimeSafetyReporter { Confidence Confidence) {} }; -/// The main entry point for the analysis. -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter); - namespace internal { /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. @@ -64,8 +60,6 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { -private: - static llvm::StringMap<int> MissingOriginCount; public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, @@ -80,10 +74,6 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS); - - static void UpdateMissingOriginCount(const OriginManager &OM); - private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; @@ -93,6 +83,12 @@ class LifetimeSafetyAnalysis { std::unique_ptr<LoanPropagationAnalysis> LoanPropagation; }; } // namespace internal + +/// The main entry point for the analysis. +std::unique_ptr<internal::LifetimeSafetyAnalysis> +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); + } // namespace clang::lifetimes #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index 231cc60b7e097..79a345a591adc 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -79,7 +79,10 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; - const llvm::StringMap<int> getMissingOrigins() const; + const llvm::StringMap<unsigned> getMissingOrigins() const; + + // Utility function to check if an origin is missing for a given expression. 
+ bool isOriginMissing(const Expr &E) const; private: OriginID getNextOriginID() { return NextOriginID++; } @@ -90,7 +93,7 @@ class OriginManager { llvm::SmallVector<Origin> AllOrigins; llvm::DenseMap<const clang::ValueDecl *, OriginID> DeclToOriginID; llvm::DenseMap<const clang::Expr *, OriginID> ExprToOriginID; - llvm::StringMap<int> ExprTypeToMissingOriginCount; + llvm::StringMap<unsigned> ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..604039ef61cb7 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,7 +14,10 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" +#include "clang/Analysis/AnalysisDeclContext.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" #include <memory> namespace clang { @@ -95,6 +98,9 @@ class AnalysisBasedWarnings { /// a single function. unsigned MaxUninitAnalysisBlockVisitsPerFunction; + /// Map from expressions missing origin in OriginManager to their counts. + llvm::StringMap<unsigned> MissingOriginCount; + /// @} public: @@ -116,6 +122,9 @@ class AnalysisBasedWarnings { Policy &getPolicyOverrides() { return PolicyOverrides; } void PrintStats() const; + + void FindMissingOrigins(AnalysisDeclContext &AC, + clang::lifetimes::internal::FactManager &FactMgr); }; } // namespace sema diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 828c08d1cbeed..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -33,25 +33,10 @@ namespace clang::lifetimes { namespace internal { -llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; - LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} -void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } - -void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); @@ -83,13 +68,15 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); - UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter) { - internal::LifetimeSafetyAnalysis Analysis(AC, Reporter); - Analysis.run(); +std::unique_ptr<internal::LifetimeSafetyAnalysis> +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) { + std::unique_ptr<internal::LifetimeSafetyAnalysis> Analysis = + std::make_unique<internal::LifetimeSafetyAnalysis>(AC, Reporter); + Analysis->run(); + return Analysis; } } // namespace clang::lifetimes 
diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index abe067a829cb7..9e314d984b74f 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/Expr.h" #include "clang/AST/TypeBase.h" #include "llvm/ADT/StringMap.h" @@ -24,7 +25,7 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } -const llvm::StringMap<int> OriginManager::getMissingOrigins() const { +const llvm::StringMap<unsigned> OriginManager::getMissingOrigins() const { return ExprTypeToMissingOriginCount; } @@ -46,10 +47,11 @@ OriginID OriginManager::get(const Expr &E) { // if the expression has no specific origin, increment the missing origin // counter. - const QualType ExprType = E.getType(); - auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + std::string ExprStr(E.getStmtClassName()); + ExprStr = ExprStr + "<" + E.getType().getAsString() + ">"; + auto CountIt = ExprTypeToMissingOriginCount.find(ExprStr); if (CountIt == ExprTypeToMissingOriginCount.end()) { - ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + ExprTypeToMissingOriginCount[ExprStr] = 1; } else { CountIt->second++; } @@ -102,4 +105,9 @@ OriginID OriginManager::getOrCreate(const ValueDecl &D) { return NewID; } +bool OriginManager::isOriginMissing(const Expr &E) const { + auto It = ExprToOriginID.find(&E); + return It == ExprToOriginID.end(); +} + } // namespace clang::lifetimes::internal diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 009994e189220..157d63b06137d 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -29,8 +29,10 @@ #include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h" #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/Analysis/Analyses/PostOrderCFGView.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +55,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -3067,7 +3070,11 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { if (AC.getCFG()) { lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S); - lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + std::unique_ptr<clang::lifetimes::internal::LifetimeSafetyAnalysis> + Analysis = + lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + if (S.CollectStats) + FindMissingOrigins(AC, Analysis->getFactManager()); } } // Check for violations of "called once" parameter properties.
@@ -3133,8 +3140,26 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } } +void clang::sema::AnalysisBasedWarnings::FindMissingOrigins( + AnalysisDeclContext &AC, lifetimes::internal::FactManager &FactMgr) { + if (AC.getCFG()) { + for (const auto &[expr, count] : + FactMgr.getOriginMgr().getMissingOrigins()) { + MissingOriginCount[expr] += count; + } + } +} + void clang::sema::AnalysisBasedWarnings::PrintStats() const { - clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + unsigned TotalMissingOrigins = 0; + for (const auto &[expr, count] : MissingOriginCount) { + llvm::errs() << expr << " : " << count << '\n'; + TotalMissingOrigins += count; + } + llvm::errs() << "Total missing origins: " << TotalMissingOrigins << "\n"; + llvm::errs() << "****************************************\n"; llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = From 5b478d13bba0960fcf2eab472381e097638580dd Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Mon, 3 Nov 2025 13:11:00 +0000 Subject: [PATCH 310/313] Adding implementation for missing origin statistics on LifetimeSafetyAnalysis --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 21 +++++++++++++++++++ .../Analyses/LifetimeSafety/Origins.h | 6 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 5 +++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 ++++++++++++++ clang/lib/Sema/AnalysisBasedWarnings.cpp | 4 +++- 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..4952d84a80369 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include <string> namespace clang::lifetimes { @@ -60,6 +64,9 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses.
class LifetimeSafetyAnalysis { +private: + static llvm::StringMap<int> MissingOriginCount; + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -73,6 +80,20 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } + static void PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + + static void UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } + private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..231cc60b7e097 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap<int> getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector<Origin> AllOrigins; llvm::DenseMap<const clang::ValueDecl *, OriginID> DeclToOriginID; llvm::DenseMap<const clang::Expr *, OriginID> ExprToOriginID; + llvm::StringMap<int> ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..a76fdd2535d97 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,14 +23,18 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include <memory> namespace clang::lifetimes { namespace internal { +llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; + LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} @@ -66,6 +70,7 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); + UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..abe067a829cb7 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include
"clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +24,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap<int> OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +43,16 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + const QualType ExprType = E.getType(); + auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. // For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..009994e189220 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -30,6 +30,7 @@ #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +54,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> #include <deque> #include <iterator> @@ -3132,8 +3134,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } void clang::sema::AnalysisBasedWarnings::PrintStats() const { + clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 
0 : NumCFGBlocks/NumCFGsBuilt; From 20bc6f2c9b89b971de90ac1bd695ddb4659442e5 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Mon, 3 Nov 2025 13:23:42 +0000 Subject: [PATCH 311/313] Minor refactoring of the static functions --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 14 ++------------ .../lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 4952d84a80369..7490df90a3282 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -80,19 +80,9 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } + static void PrintStats(llvm::raw_ostream &OS); - static void UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } + static void UpdateMissingOriginCount(const OriginManager &OM); private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index a76fdd2535d97..828c08d1cbeed 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -39,6 +39,19 @@ LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} +void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + +void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From 41d835c613f521295f3a59bc06bb7c21d0c80595 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Wed, 5 Nov 2025 13:09:03 +0000 Subject: [PATCH 312/313] Correcting merging errors --- .../Analysis/LifetimeSafety/LifetimeSafety.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 1e7d8f976c33f..744cf7537f8cb 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -33,25 +33,10 @@ namespace clang::lifetimes { namespace internal { -llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; - LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} -void
LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } - -void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From 5efc02004d67debbf75f4a9bd06a41bdf4488c32 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadri1010@gmail.com> Date: Wed, 5 Nov 2025 13:24:22 +0000 Subject: [PATCH 313/313] Removing the obsolete static functions from LifetimeSafetyAnalysis --- .../include/clang/Analysis/Analyses/LifetimeSafety/Origins.h | 2 +- clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 1 - clang/lib/Analysis/LifetimeSafety/Origins.cpp | 5 ----- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index 3f8c8a4d7ce9b..26686a63e9204 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -79,7 +79,7 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; - const llvm::StringMap<int> getMissingOrigins() const; + const llvm::StringMap<unsigned> getMissingOrigins() const; private: OriginID getNextOriginID() { return NextOriginID++; } diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 744cf7537f8cb..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -68,7 +68,6 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); - UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index 9e314d984b74f..b8f705e3377c2 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -105,9 +105,4 @@ OriginID OriginManager::getOrCreate(const ValueDecl &D) { return NewID; } -bool OriginManager::isOriginMissing(const Expr &E) const { - auto It = ExprToOriginID.find(&E); - return It == ExprToOriginID.end(); -} - } // namespace clang::lifetimes::internal
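Taken together, the series makes OriginManager tally, for every expression that reaches OriginManager::get() without an assigned origin, a counter keyed by "StmtClassName<type>", and has AnalysisBasedWarnings::PrintStats() dump the per-kind counts plus a total. Below is a minimal standalone sketch of that counting-and-reporting pattern; it uses std::map in place of llvm::StringMap and a plain struct in place of the Clang AST, and every name in it (FakeExpr, OriginCounter, visit, printStats) is illustrative rather than a Clang API.

// Standalone sketch of the missing-origin statistics pattern from the
// patches above. Standard library only; FakeExpr stands in for clang::Expr.
#include <iostream>
#include <map>
#include <string>

struct FakeExpr {          // illustrative stand-in for clang::Expr
  std::string StmtClass;   // e.g. "DeclRefExpr" (cf. getStmtClassName())
  std::string Type;        // e.g. "int *" (cf. getType().getAsString())
  bool HasOrigin;          // whether an origin was assigned to it
};

class OriginCounter {
public:
  // Mirrors the counting step in OriginManager::get(): when an expression
  // has no origin, bump a counter keyed by "StmtClass<type>".
  void visit(const FakeExpr &E) {
    if (!E.HasOrigin)
      ++MissingOriginCount[E.StmtClass + "<" + E.Type + ">"];
  }

  // Mirrors AnalysisBasedWarnings::PrintStats(): per-kind counts plus a total.
  void printStats(std::ostream &OS) const {
    OS << "*** LifetimeSafety Missing Origin Stats (expression_type : count):\n";
    unsigned Total = 0;
    for (const auto &[Kind, Count] : MissingOriginCount) {
      OS << Kind << " : " << Count << '\n';
      Total += Count;
    }
    OS << "Total missing origins: " << Total << '\n';
  }

private:
  std::map<std::string, unsigned> MissingOriginCount;
};

int main() {
  OriginCounter Counter;
  Counter.visit({"DeclRefExpr", "int *", false});
  Counter.visit({"CallExpr", "int *", false});
  Counter.visit({"DeclRefExpr", "int *", false});
  Counter.visit({"DeclRefExpr", "int &", true}); // has an origin; not counted
  Counter.printStats(std::cout);
}

Keying the counter on the statement class plus the printed type, as the later patches do, means the report says not only how many expressions lack origins but which kinds they are, which is what makes the statistic useful when deciding where to add origin support next.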
Number Section Status Issue title Available in Clang?
1[dcl.fct.default] TC1 What if two using-declarations refer to the same function but the declarations introduce different default-arguments? No
2[temp.dep.res] drafting How can dependent names be used in member declarations that appear outside of the class template definition? Not resolved
3[temp.expl.spec] NAD The template compilation model rules render some explicit specialization declarations not visible during instantiation Clang 2.7
4[dcl.link] CD1 Does extern "C" affect the linkage of function names with internal linkage? Clang 2.8
5[dcl.init] CD1 CV-qualifiers and type conversions Clang 3.1
6[class.copy.elision] NAD Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? Yes
7[class.access.base] NAD Can a class with a private virtual base class be derived from? Clang 3.4
8[class.access] CD1 Access to template arguments used in a function return type and in the nested name specifier Duplicate of 45
9[class.access.base] CD1 Clarification of access to base class members Clang 2.8
10[class.access.nest] CD1 Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? Duplicate of 45
11[namespace.udecl] CD1 How do the keywords typename/template interact with using-declarations? Clang 2.7
12[basic.lookup.argdep] dup Default arguments on different declarations for the same function and the Koenig lookup Superseded by 239
13[dcl.link] NAD extern "C" for Parameters of Function Templates No
14[dcl.link] NAD extern "C" functions and declarations in different namespaces Clang 3.4
15[dcl.fct.default] dup Default arguments for parameters of function templates Clang 2.7
16[class.access.base] CD1 Access to members of indirect private base classes Clang 2.8
17[class.access.base] NAD Footnote 99 should discuss the naming class when describing members that can be accessed from friends Clang 2.7
18[dcl.fct] NAD f(TYPE) where TYPE is void should be allowed Superseded by 577
19[class.protected] NAD Clarify protected member access Clang 3.1
20[class.copy.ctor] TC1 Some clarifications needed for 12.8 para 15 Clang 2.8
21[temp.param] TC1 Can a default argument for a template parameter appear in a friend declaration? Clang 3.4
22[temp.dep.res] TC1 Template parameter with a default argument that refers to itself Superseded by 481
23[temp.func.order] NAD Some questions regarding partial ordering of function templates Clang 2.7
24[temp.expl.spec] TC1 Errors in examples in 14.7.3 N/A
25[except.spec] TC1 Exception specifications and pointers to members Clang 4
26[class.copy.ctor] NAD Copy constructors and default arguments Clang 2.7
27[over.built] NAD Overload ambiguities for builtin ?: prototypes Clang 2.7
28[basic.start.dynamic] CD1 'exit', 'signal' and static object destruction N/A (Library DR)
29[dcl.link] CD1 Linkage of locally declared functions Clang 3.4
30[temp.names] TC1 Valid uses of "::template" Superseded by 468 (C++11 onwards)
31[expr.new] NAD Looking up new/delete Clang 2.8
32[temp] TC1 Clarification of explicit instantiation of non-exported templates N/A
33[basic.lookup.argdep] TC1 Argument dependent lookup and overloaded functions Clang 9
34[temp.inst] NAD Argument dependent lookup and points of instantiation N/A
35[dcl.init] TC1 Definition of default-initialization Duplicate of 178
36[namespace.udecl] CD6 using-declarations in multiple-declaration contexts Clang 2.8
37[except.uncaught] NAD When is uncaught_exception() true? Superseded by 475
38[temp.names] TC1 Explicit template arguments and operator functions Clang 2.7
39[class.member.lookup] CD1 Conflicting ambiguity rules No
40[dcl.meaning] TC1 Syntax of declarator-id N/A
41[basic.lookup.unqual] TC1 Clarification of lookup of names after declarator-id Clang 2.7
42[basic.scope.class] NAD Redefining names from base classes Clang 2.7
43[basic.types] TC1 Copying base classes (PODs) using memcpy N/A
44[temp.expl.spec] CD1 Member specializations Superseded by 727
45[class.access.nest] CD1 Access to nested classes Clang 2.7
46[temp.explicit] NAD Explicit instantiation of member templates Clang 2.7
47[temp.friend] NAD Template friend issues Superseded by 329
48[class.static.data] TC1 Definitions of unused static members Clang 2.7
49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
57[class.union] open Empty unions Not resolved
58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
61[over.over] NAD Address of static member function "&p->f" Clang 3.4
62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
65[dcl.fct.default] TC1 Typo in default argument example N/A
66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
67[class.static] TC1 Evaluation of left side of object-expression N/A
68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
71[expr] NAD Incorrect cross reference N/A
72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
73[expr.eq] TC1 Pointer equality Superseded by 1652
74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
75[class.mem] TC1 In-class initialized members must be const Clang 2.7
76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
79[new.delete.placement] dup Alignment and placement new N/A
80[class.mem] TC1 Class members with same name as class Clang 2.9
81[diff] NAD Null pointers and C compatibility N/A
82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
87[except.spec] CD1 Exception specifications on function parameters No
88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
98[except] TC1 Branching into try block Clang 2.7
99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition ? N/A
104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
105[temp] TC1 Meaning of "template function" N/A
106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
107[dcl.link] NAD Linkage of operator functions Clang 2.7
108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
113[expr.call] CD1 Visibility of called function Clang 2.7
114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
115[over.over] CD1 Address of template-id Clang 3.0
116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
117[class.temporary] NAD Timing of destruction of temporaries N/A
118[expr.call] CD1 Calls via pointers to virtual member functions Yes
119[basic.life] CD1 Object lifetime and aggregate initialization N/A
120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
123[expr.prim.general] TC1 Bad cross-reference N/A
124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
126[except.spec] TC1 Exception specifications and const Partial
127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
128[expr.static.cast] TC1 Casting between enum types Clang 2.7
129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
130[expr.new] NAD Sequence points and new-expressions N/A
131[extendid] TC1 Typo in Lao characters Superseded by P1949
132[basic.link] NAD Local types and linkage No
133[except.spec] dup Exception specifications and checking Duplicate of 87
134[temp] TC1 Template classes and declarator-ids N/A
135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
138[namespace.memdef] CD6 Friend declaration name lookup Partial
139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
144[dcl.type.elab] open Position of friend specifier Not resolved
145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
146[basic.fundamental] open Floating-point zero Not resolved
147[expr.prim.general] TC1 Naming the constructor Clang 2.7
148[class] TC1 POD classes and pointers to members Clang 2.7
149[conv.ptr] TC1 Accessibility and ambiguity N/A
150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
155[dcl.init] dup Brace initializer for scalar Duplicate of 632
156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
157[dcl.pre] open Omitted typedef declarator Not resolved
158[basic.lval] CD1 Aliasing and qualification conversions Yes
159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
160[dcl.ambig.res] CD1 Missing std:: qualification N/A
161[class.protected] TC1 Access to protected nested type Clang 3.1
162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
165[namespace.memdef] NAD Definitions of friends and block-scope externs No
166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
167[depr.static] NAD Deprecating static functions Superseded by 1012
168[dcl.link] NAD C linkage for static member functions No
169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
171[basic.namespace] TC1 Global namespace scope Clang 3.4
172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
173[lex.charset] TC1 Constraints on execution character set Clang 2.7
174[depr.static] NAD Undeprecating global static Superseded by 1012
175[class] CD1 Class name injection and base name access Clang 2.8
176[class] TC1 Name injection and templates Clang 3.1
177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
178[dcl.init] TC1 More on value-initialization Clang 3.1
179[expr.add] TC1 Function pointers and subtraction Clang 2.7
180[temp.res] CD1 typename and elaborated types Clang 2.8
181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
183[temp.res] TC1 typename in explicit specializations Superseded by 382
184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
186[temp.local] open Name hiding and template template-parameters Not resolved
187[temp.param] TC1 Scope of template parameter names Superseded by 481
188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
189[lex.operators] open Definition of operator and punctuator Not resolved
190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
194[class.ctor] TC1 Identifying constructors Clang 2.7
195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
196[expr.delete] open Arguments to deallocation functions Not resolved
197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
202[over.over] TC1 Use of overloaded function name Clang 3.1
203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
204[temp] CD1 Exported class templates Superseded by 820
205[temp] drafting Templates and static data members Not resolved
206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
207[class.access.base] CD1 using-declarations and protected access Clang 2.7
208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
223[depr] CD3 The meaning of deprecation N/A
224[temp.dep.type] CD1 Definition of dependent names Clang 16
225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
226[temp.param] CD1 Default template arguments for function templates No
227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
234[basic.life] NAD Reuse of base class subobjects N/A
235[class.base.init] TC1 Assignment vs initialization N/A
236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
238[expr] CD4 Precision and accuracy constraints on floating point Unknown
239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
242[expr.cast] CD4 Interpretation of old-style casts Unknown
243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
244[class.dtor] CD1 Destructor lookup Clang 11
245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
248[extendid] C++11 Identifier characters Superseded by P1949
249[temp.mem.func] TC1 What is a member function template? Clang 2.7
250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
251[basic.fundamental] open How many signed integer types are there? Not resolved
252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
256[expr.new] CD1 Overflow in size calculations Duplicate of 624
257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
260[over.built] open User-defined conversions and built-in operator= Not resolved
261[basic.def.odr] CD1 When is a deallocation function "used?" No
262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
266[gram] NAD No grammar sentence symbol N/A
267[expr.new] open Alignment requirement for new-expressions Not resolved
268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
273[class] CD1 POD classes and operator&() Clang 2.7
274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
278[basic.link] NAD External linkage and nameless entities Unknown
279[basic.link] CD6 Correspondence of "names for linkage purposes" No
280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
281[dcl.fct.spec] CD1 inline specifier in friend declarations No
282[expr.typeid] open Namespace for extended_type_info Not resolved
283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
284[class] CD1 qualified-ids in class declarations No
285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
287[temp.point] drafting Order dependencies in template instantiation Not resolved
288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
294[expr.static.cast] NAD Can static_cast drop exception specifications? No
295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
301[temp.names] CD1 Syntax for template-name Clang 3.5
302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
303[conv.prom] NAD Integral promotions on bit-fields N/A
304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
305[basic.lookup.classref] CD1 Name lookup in destructor call No
306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
314[temp.names] C++17 template in base class specifier No
315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
323[temp] CD1 Where must export appear? Superseded by 820
324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
325[dcl.fct.default] open When are default arguments parsed? Not resolved
326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
327[class] CD1 Use of "structure" without definition Duplicate of 538
328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
343[temp.names] C++17 Make template optional in contexts that require a type No
344[class.dtor] CD3 Naming destructors Duplicate of 1435
345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
346[except.spec] NAD Typo in 15.4 N/A
347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
350[basic.types] open signed char underlying representation for objects Not resolved
351[expr] CD1 Sequence point error: unspecified or undefined? N/A
352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
357[intro.defs] CD1 Definition of signature should include name Clang 2.7
358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
359[class.union] NAD Type definition in anonymous union Clang 3.3
360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
361[dcl.fct.default] open Forward reference to default argument Not resolved
362[lex.phases] CD1 Order of initialization in instantiation units N/A
363[class.expl.init] NAD Initialization of class from self N/A
364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
365[basic.stc] open Storage duration and temporaries Not resolved
366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
371[basic.start.static] open Interleaving of constructor calls Not resolved
372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
379[class] CD1 Change "class declaration" to "class definition" N/A
380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
389[basic.link] CD1 Unnamed types in entities with linkage No
390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
394[cpp.pre] CD1 identifier-list is never defined N/A
395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
399[class.dtor] CD6 Destructor lookup redux Clang 11
400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
402[temp.func.order] open More on partial ordering of function templates Not resolved
403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
413[class] CD1 Definition of "empty class" Clang 2.7
414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
428[except.throw] CD1 Mention of expression with reference type Clang 2.7
429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
445[class.friend] NAD Wording issue on friend declarations Clang 3.2
446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
452[class.this] CD1 Wording nit on description of this Clang 2.7
453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
454[class.static.data] CD1 When is a definition of a static data member required? Unknown
455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
459[temp.local] NAD Hiding of template parameters by base class members Unknown
460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
461[dcl.asm] NAD Make asm conditionally-supported N/A
462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
465[basic.start.static] NAD May constructors of global objects call exit()? N/A
466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
469[temp.deduct.type] NAD Const template specializations and reference arguments No
470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
472[class.protected] open Casting across protected inheritance Not resolved
473[expr.new] NAD Block-scope declarations of allocator functions Unknown
474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
476[expr.new] CD5 Determining the buffer size for placement new Unknown
477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
479[except.throw] CD1 Copy elision in exception handling Clang 2.8
480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
481[basic.scope] CD2 Scope of template parameters Clang 2.8
482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
485[basic.pre] CD1 What is a “name”? Clang 2.7
486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
508[dcl.init] C++11 Non-constructed value-initialized objects N/A
509[dcl.init] CD1 Dead code in the specification of default initialization N/A
510[class.init] CD1 Default initialization of POD classes? N/A
511[class.prop] NAD POD-structs with template assignment operators Unknown
512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
513[intro.object] CD1 Non-class “most-derived” objects N/A
514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
520[expr.cast] CD1 Old-style casts between incomplete class types N/A
521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
525[temp.inst] CD1 Missing * in example Clang 2.7
526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
527[basic.link] CD2 Problems with linkage of types N/A
528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
533[cpp.include] NAD Special treatment for C-style header names N/A
534[temp] CD1 template-names and operator-function-ids Clang 2.9
535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
537[intro.defs] CD1 Definition of “signature” N/A
538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
541[temp.dep.expr] CD2 Dependent function types Clang 2.7
542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
560[temp.res] NAD Use of the typename keyword in return types Clang 16
561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
562[class.qual] CD6 qualified-ids in non-expression contexts N/A
563[dcl.link] CD6 Linkage specification for objects Clang 3.3
564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
571[basic.link] CD2 References declared const Clang 2.7
572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
579[temp.names] open What is a “nested” > or >>? Not resolved
580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
582[temp.mem] CD1 Template conversion functions N/A
583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
584[basic.lval] NAD Unions and aliasing N/A
585[class.friend] NAD Friend template template parameters Clang 3.0
586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
592[except.ctor] CD1 Exceptions during construction of local static objects N/A
593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
596[except.unexpected] NAD Replacing an exception object Unknown
597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
599[expr.delete] CD2 Deleting a null function pointer Partial
600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
611[dcl.init] CD2 Zero-initializing references Clang 2.7
612[intro.execution] CD2 Requirements on a conforming implementation N/A
613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
614[expr.mul] CD1 Results of integer / and % Clang 2.7
615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
619[basic.types] C++11 Completeness of array types Clang 3.4
620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
624[expr.new] CD1 Overflow in calculating size of allocation Unknown
625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
627[basic.fundamental] NAD Values behaving as types Clang 2.7
628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
631[stmt.if] CD3 Jumping into a “then” clause N/A
632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
638[temp.friend] CD2 Explicit specialization and friendship No
639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
644[basic.types] CD1 Should a trivial class type be a literal type? Partial
645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
659[expr.alignof] CD1 Alignment of function types Clang 3.0
660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
677[class.dtor] CD1 Deleted operator delete and virtual destructors No
678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
680[class.copy.ctor] CD2 What is a move constructor? N/A
681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
693[conv.array] CD2 New string types and deprecated conversion Unknown
694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
697[temp.deduct] open Deduction rules apply to more than functions Not resolved
698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
710[class.cdtor] CD2 Data races during construction Unknown
711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
715[expr.const] CD2 Class member access constant expressions Unknown
716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
718[class.friend] NAD Non-class, non-function friend declarations Unknown
719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
727[temp.expl.spec] C++17 In-class explicit specializations Partial
728[temp] NAD Restrictions on local classes Unknown
729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
739[class.bit] CD3 Signedness of plain bit-fields Unknown
740[intro.multithread] CD2 Incorrect note on data races Unknown
741[class.bit] C++11 “plain” long long bit-fields Unknown
742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
747[class.access.base] dup Access of protected base classes Unknown
749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
757[basic.link] CD2 Types without linkage in declarations Unknown
758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
783[intro.defs] open Definition of “argument” Not resolved
784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
786[intro.multithread] CD2 Definition of “thread” Unknown
787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
789[lex.trigraph] CD2 Deprecating trigraphs Unknown
790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
792[basic.start.main] CD2 Effects of std::quick_exit Unknown
793[basic.life] CD2 Use of class members during destruction Unknown
794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
804[expr.new] CD2 Deducing the type in new auto(x) Unknown
805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
807[expr.const] NAD typeid expressions in constant expressions Unknown
808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
809[dcl.stc] CD2 Deprecation of the register keyword Unknown
810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
820[temp] CD2 Deprecation of export Clang 2.7
822[temp] NAD Additional contexts for template aliases Unknown
823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
828[except.throw] CD2 Destruction of exception objects Unknown
829[except.spec] NAD At what point is std::unexpected called? Unknown
830[except.spec] CD2 Deprecating exception specifications Unknown
831[implimits] CD2 Limit on recursively nested template instantiations Unknown
832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
834[lex.string] CD2 What is an “ordinary string literal”? Unknown
835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
839[expr.sizeof] dup sizeof with opaque enumerations Unknown
840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
846[basic.lval] CD2 Rvalue references to functions Unknown
847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
854[expr.shift] CD2 Left shift and unsigned extended types Unknown
855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
869[dcl.init] CD2 Uninitialized thread_local objects Unknown
872[lex.string] CD2 Lexical issues with raw strings Unknown
873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
882[basic.start.main] CD2 Defining main as deleted Clang 3.5
883[basic.types] CD2 std::memcpy vs std::memmove Unknown
884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
887[class.copy.ctor] CD2 Move construction of thrown object Unknown
888[class.base.init] CD2 Union member initializers Unknown
891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
901[expr.new] open Deleted operator delete Not resolved
902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
909[expr.cast] NAD Old-style casts with conversion functions Unknown
910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
912[lex.ccon] CD3 Character literals and universal-character-names Unknown
913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
914[expr.type.conv] open Value-initialization of array types Not resolved
915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
923[temp.expl.spec] CD2 Inline explicit specializations Unknown
924[class.mem] C++11 alias-declaration as a class member Unknown
925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
929[temp.alias] CD2 What is a template alias? Unknown
930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
934[dcl.init.list] CD2 List-initialization of references Unknown
935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
936[dcl.init.string] CD2 Array initialization with new string literals Unknown
937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
940[dcl.stc] CD2 Global anonymous unions Unknown
941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
942[basic.pre] CD2 Is this an entity? Unknown
943[expr.type.conv] CD5 Is T() a temporary? Unknown
944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
948[stmt.select] C++11 constexpr in conditions Clang 3.7
949[intro.compliance] open Requirements for freestanding implementations Not resolved
950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
953[over.ics.ref] CD2 Rvalue references and function viability Unknown
954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
958[expr.prim.lambda] NAD Lambdas and decltype Unknown
959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
963[expr.rel] CD2 Comparing nullptr with 0 Unknown
964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
966[basic.link] CD2 Nested types without linkage Unknown
967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
973[except.spec] CD2 Function types in exception-specifications Unknown
974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
992[class.copy.ctor] NAD Inheriting explicitness Unknown
993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
994[dcl.fct] C++11 braced-init-list as a default argument Unknown
995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
998[dcl.fct] dup Function parameter transformations and template functions Unknown
999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
1002[temp.variadic] NAD Pack expansion for function arguments Unknown
1003[basic.start.main] CD3 Acceptable definitions of main Unknown
1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
1007[class.protected] NAD Protected access and pointers to members Unknown
1008[expr.alignof] NAD Querying the alignment of an object Unknown
1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
1012[namespace.unnamed] C++11 Undeprecating static Unknown
1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
1016[over] C++11 Overloadable declarations, function templates, and references Unknown
1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
1021[namespace.memdef] CD4 Definitions of namespace members Unknown
1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
1029[class.dtor] C++11 Type of a destructor call Unknown
1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
1032[temp.variadic] C++11 Empty pack expansions Unknown
1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
1038[over.over] CD7 Overload resolution of &x.static_func Unknown
1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
1040[intro.multithread] NAD Memory model issues Unknown
1041[class.mem] dup alias-declarations as class members Unknown
1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
1050[basic.life] NAD Effects of thread support on object lifetime Unknown
1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
1052[class.copy.ctor] dup const non-static data member and PODness Unknown
1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
1055[basic.fundamental] C++11 Permissible uses of void Unknown
1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
1090[basic.align] C++11 Alignment of subobjects Unknown
1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
1093[dcl.init] CD3 Value-initializing non-objects Unknown
1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
1095[dcl.init.list] C++11 List-initialization of references Unknown
1096[temp] C++11 Missing requirement for template definitions Unknown
1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
1101[class.static.data] C++11 Non-integral initialized static data members Unknown
1102[intro.execution] C++11 Better example of undefined behavior Unknown
1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
1108[lex.ext] NAD User-defined literals have not been implemented Unknown
1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
1114[basic.life] C++11 Incorrect use of placement new in example Unknown
1115[basic.align] C++11 C-compatible alignment specification Unknown
1116[basic.life] CD4 Aliasing of union members Unknown
1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
1126[expr.const] C++11 constexpr functions in const initializers Unknown
1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
1140[class] C++11 Incorrect redefinition of POD class Unknown
1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
1144[class.access.dcl] C++11 Remove access declarations Unknown
1145[class.ctor] C++11 Defaulting and triviality Unknown
1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
1147[class.dtor] C++11 Destructors should be default nothrow Unknown
1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
1153[over.over] C++11 Type matching in address of overloaded function Unknown
1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
1167[except.spec] C++11 function-try-blocks for destructors Unknown
1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
1176[intro.multithread] C++11 Definition of release sequence Unknown
1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
1180[basic.align] C++11 Over-aligned class types Unknown
1181[basic.types] C++11 What is a “built-in type?” Unknown
1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
1187[basic.start.static] C++11 Problems in initialization example Unknown
1188[expr.const] C++11 Type punning in constant expressions Unknown
1189[intro.object] C++11 Address of distinct base class subobjects Unknown
1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
1194[dcl.constexpr] C++11 Constexpr references Unknown
1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
1197[expr.const] C++11 Constexpr arrays Unknown
1198[basic.types] C++11 Literal types and copy constructors Unknown
1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
1211[basic.align] open Misaligned lvalues Not resolved
1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
1214[dcl.init] C++11 Kinds of initializers Unknown
1215[class] C++11 Definition of POD struct Unknown
1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
1240[dcl.name] C++11 constexpr defaulted constructors Unknown
1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
1242[class.base.init] C++11 Initializing variant class members Unknown
1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
1253[temp.spec] C++17 Generic non-template members Unknown
1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
1255[expr.const] drafting Definition problems with constexpr functions Not resolved
1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
1264[expr.const] CD3 Use of this in constexpr constructor Unknown
1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
1266[lex.ext] open user-defined-integer-literal overflow Not resolved
1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
1273[temp.deduct] NAD Accessibility and function signatures Unknown
1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
1276[basic.fundamental] NAD Reference to stdint.h Unknown
1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
1280[basic.life] NAD Object reallocation and reference members Unknown
1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
1285[basic.life] NAD Trivial destructors and object lifetime Unknown
1286[temp.alias] open Equivalence of alias templates Not resolved
1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
1288[dcl.init.list] CD3 Reference list initialization Unknown
1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
1293[expr.const] CD3 String literals in constant expressions Unknown
1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
1300[expr.type.conv] dup T() for array types Unknown
1301[dcl.init] CD3 Value initialization of union Unknown
1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
1303[temp] NAD C language linkage for template with internal linkage Unknown
1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
1306[class.this] CD3 Modifying an object within a const member function Unknown
1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
1318[class] CD3 Syntactic ambiguities with final Unknown
1319[temp.param] NAD Error in pack expansion example Unknown
1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
1329[implimits] CD3 Recursive deduction substitutions Unknown
1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
1349[temp.alias] dup Consistency of alias template redeclarations Unknown
1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
1363[class] CD3 Triviality vs multiple default constructors Unknown
1364[expr.const] CD3 constexpr function parameters Unknown
1365[expr.const] CD3 Calling undefined constexpr functions Unknown
1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
1367[expr.const] CD3 Use of this in a constant expression Unknown
1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
1375[class.union] CD3 Reference to anonymous union? Unknown
1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
1378[temp.inst] CD5 When is an instantiation required? Unknown
1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
1382[dcl.decl] CD3 Dead code for constructor names Unknown
1383[expr] CD3 Clarifying discarded-value expressions Unknown
1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
1400[expr.eq] NAD Function pointer equality Unknown
1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
1403[lex.comment] CD6 Universal-character-names in comments Unknown
1404[class.union] open Object reallocation in unions Not resolved
1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
1420[class.abstract] NAD Abstract final classes Unknown
1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
1431[except] CD3 Exceptions from other than throw-expressions Unknown
1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
1434[dcl.init] NAD Parenthesized braced-init-list Unknown
1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
1448[basic.fundamental] NAD Integral values of type bool Unknown
1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
1450[expr.mul] CD3 INT_MIN % -1 Unknown
1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
1452[expr.const] NAD Value-initialized objects may be constants Unknown
1453[basic.types] CD3 Volatile members in literal classes? Unknown
1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
1455[expr.const] CD3 Lvalue converted constant expressions Unknown
1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
1460[class.union] C++14 What is an empty union? Clang 3.5
1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
1463[temp.pre] drafting extern "C" alias templates Not resolved
1464[expr.new] CD3 Negative array bound in a new-expression Unknown
1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
1469[expr.new] CD5 Omitted bound in array new-expression Unknown
1470[intro.multithread] NAD Thread migration Unknown
1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
1472[basic.def.odr] CD3 odr-use of reference variables Unknown
1473[over.literal] CD3 Syntax of literal-operator-id Unknown
1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
1476[intro.defs] CD3 Definition of user-defined type Unknown
1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
1484[temp.inst] CD4 Unused local classes of function templates Unknown
1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
1492[class.dtor] CD4 Exception specifications on template destructors Unknown
1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
1496[class.name] CD4 Triviality with deleted and missing default constructors No
1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
1503[except.throw] CD3 Exceptions during copy to exception object Unknown
1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
1509[intro.defs] C++14 Definition of “non-template function” Unknown
1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
1515[basic.fundamental] CD3 Modulo 2n arithmetic for implicitly-unsigned types Unknown
1516[expr.call] CD3 Definition of “virtual function call” Unknown
1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
1521[expr.type.conv] dup T{expr} with reference types Unknown
1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
1527[expr.assign] CD3 Assignment from braced-init-list Unknown
1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
1531[intro.defs] CD3 Definition of “access” (verb) Unknown
1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
1535[expr.const] CD3 typeid in core constant expressions Unknown
1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
1539[basic.fundamental] CD3 Definition of “character type” Unknown
1540[expr.const] NAD Use of address constants in constant expressions Unknown
1541[stmt.return] CD3 cv void return types Unknown
1542[expr.assign] open Compound assignment of braced-init-list Not resolved
1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
1545[temp.friend] NAD friend function templates defined in class templates Unknown
1546[temp.deduct] NAD Errors in function template default arguments Unknown
1547[temp.res] NAD typename keyword in alias-declarations Unknown
1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
1549[over.binary] open Overloaded comma operator with void operand Not resolved
1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
1554[temp.alias] drafting Access and alias templates Not resolved
1555[expr.call] NAD Language linkage and function type compatibility Unknown
1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
1576[expr] C++14 Discarded-value volatile xvalues Unknown
1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
1578[dcl.init] NAD Value-initialization of aggregates Unknown
1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
1586[class.dtor] NAD Naming a destructor via decltype Unknown
1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
1592[temp.arg.template] C++14 When do template parameters match? Unknown
1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
1596[expr.rel] CD4 Non-array objects as array[1] Unknown
1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
1606[expr.sizeof] NAD sizeof closure class Clang 3.1
1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
1617[dcl.align] open alignas and non-defining declarations Not resolved
1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
1619[temp.dep.type] open Definition of current instantiation Not resolved
1620[over.literal] open User-defined literals and extended integer types Not resolved
1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
1624[except.ctor] NAD Destruction of union members with member initializers Unknown
1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
1628[expr.new] open Deallocation function templates Not resolved
1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
1630[dcl.init] CD4 Multiple default constructor templates Unknown
1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
1634[basic.stc] open Temporary storage duration Not resolved
1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
1640[dcl.array] CD5 Array of abstract instance of class template Unknown
1641[class.base.init] NAD Assignment in member initializer Unknown
1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
1643[temp.param] NAD Default arguments for template parameter packs Unknown
1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
1655[lex.pptoken] open Line endings in raw string literals Not resolved
1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
1661[intro.multithread] NAD Preservation of infinite loops Unknown
1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
1666[temp.arg.nontype] C++14 Address constant expressions Unknown
1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
1669[basic.start.main] C++14 auto return type for main Unknown
1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
1675[implimits] NAD Size limit for automatic array object Unknown
1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
1695[class.temporary] NAD Lifetime extension via init-capture Unknown
1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
1698[lex.phases] CD7 Files ending in \ Unknown
1699[class.friend] open Does befriending a class befriend its friends? Not resolved
1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
1701[basic.types] open Array vs sequence in object representation Not resolved
1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
1710[class.derived] C++17 Missing template keyword in class-or-decltype No
1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
1713[dcl.link] dup Linkage of variable template specializations Unknown
1714[class.local] NAD odr-use of this from a local class Unknown
1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
1720[cpp.include] NAD Macro invocation in #include directive Unknown
1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
1723[lex.ext] open Multicharacter user-defined character literals Not resolved
1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
1740[except.spec] C++14 Disambiguation of noexcept Unknown
1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
1757[expr.const] CD4 Const integral subobjects Unknown
1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
1759[lex.string] C++14 UTF-8 code units in plain char Unknown
1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
1761[dcl.array] NAD Runtime check on size of automatic array Unknown
1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
1768[dcl.array] NAD Zero-element array of runtime bound Unknown
1769[except.handle] C++14 Catching a base class of the exception object Unknown
1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
1794[temp.names] C++17 template keyword and alias templates Clang 2.7
1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
1798[except.spec] NAD exception-specifications of template arguments Unknown
1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
1810[lex.ext] CD4 Invalid ud-suffixes Unknown
1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
1812[temp.names] C++17 Omission of template in a typename-specifier No
1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
1817[dcl.link] open Linkage specifications and nested scopes Not resolved
1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
1826[expr.const] NAD const floating-point in constant expressions Unknown
1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
1829[temp.dep.type] CD6 Dependent unnamed types Unknown
1830[dcl.pre] CD4 Repeated specifiers Unknown
1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
1841[temp.local] CD6 < following template injected-class-name Unknown
1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
1844[temp.deduct] open Defining “immediate context” Not resolved
1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
1853[basic.life] dup Defining “allocated storage” Unknown
1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
1856[temp.inst] open Indirect nested classes of class templates Not resolved
1857[expr.shift] CD5 Additional questions about bits Unknown
1858[expr.eq] CD4 Comparing pointers to union members Unknown
1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
1860[class.union] C++17 What is a “direct member?” Unknown
1861[class.bit] CD4 Values of a bit-field Unknown
1862[temp.friend] CD5 Determining “corresponding members” for friendship No
1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
1864[dcl.init.list] NAD List-initialization of array objects Unknown
1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
1873[class.access.base] CD4 Protected member access from derived class friends Unknown
1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
1878[dcl.spec.auto] CD4 operator auto template Clang 18
1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
1880[expr.call] CD4 When are parameter objects destroyed? Unknown
1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
1882[global.names] CD4 Reserved names without library use Unknown
1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
1885[expr.call] CD4 Return value of a function is underspecified Unknown
1886[basic.start.main] CD4 Language linkage for main() Unknown
1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
1890[class.mem] drafting Member type depending on definition of member function Not resolved
1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
1896[temp.alias] CD6 Repeated alias templates Unknown
1897[basic.def.odr] review ODR vs alternative tokens Not resolved
1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
1901[lex.token] open punctuator referenced but not defined Not resolved
1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
1904[temp.param] NAD Default template arguments for members of class templates Unknown
1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
1914[dcl.attr] extension Duplicate standard attributes Extension
1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
1918[temp.friend] CD5 friend templates with dependent scopes No
1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
1922[temp.local] CD4 Injected class template names and default arguments Unknown
1923[expr.unary.op] NAD Lvalues of type void Unknown
1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
1925[expr.comma] CD4 Bit-field prvalues Unknown
1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
1933[implimits] NAD Implementation limit for initializer-list elements Unknown
1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
1936[temp.dep] CD6 Dependent qualified-ids Unknown
1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
1944[diff] open New C incompatibilities Not resolved
1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
1951[basic.types] CD4 Cv-qualification and literal types Unknown
1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
1953[intro.memory] CD7 Data races and common initial sequence Unknown
1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
1962[dcl.fct.def.general] open Type of __func__ Not resolved
1963[lex.name] CD4 Implementation-defined identifier characters Unknown
1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
1968[expr.const] NAD Address of typeid in constant expressions No
1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
1981[conv] CD4 Implicit contextual conversions and explicit Unknown
1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
1987[class.static.data] NAD constexpr static data members across translation units Unknown
1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
1997[basic.indet] CD7 Placement new and previous initialization Unknown
1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
2000[lex.pptoken] CD4 header-name outside #include directive Unknown
2001[cpp.pre] CD4 non-directive is underspecified Unknown
2002[cpp.pre] open White space within preprocessing directives Not resolved
2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
2006[basic.compound] CD4 Cv-qualified void types Unknown
2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
2008[temp.arg] CD4 Default template-arguments underspecified Unknown
2009[basic.scope.class] CD6 Unclear specification of class scope N/A
2010[except.spec] CD4 exception-specifications and conversion operators Unknown
2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
2012[basic.stc] CD4 Lifetime of references Unknown
2013[expr.add] drafting Pointer subtraction in large array Not resolved
2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
2021[temp.over.link] dup Function template redeclaration via alias template Unknown
2022[expr.const] CD4 Copy elision in constant expressions Unknown
2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
2025[temp.over.link] dup Declaration matching via alias templates Unknown
2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
2029[expr.call] dup Abstract class return type in decltype operand Unknown
2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
2032[temp.param] CD4 Default template-arguments of variable templates Unknown
2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
2037[temp.type] drafting Alias templates and template declaration matching Not resolved
2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
2039[except.spec] CD4 Constant conversions to bool Unknown
2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
2046[intro.multithread] C++17 Incomplete thread specifications Unknown
2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
2050[dcl.stc] NAD Consolidate specification of linkage Unknown
2051[basic.lval] CD5 Simplifying alias rules Unknown
2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
2062[temp.class] CD6 Class template redeclaration requirements Unknown
2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
2069[class.dtor] CD4 Do destructors have names? Unknown
2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
2071[dcl.typedef] CD4 typedef with no declarator Unknown
2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
2080[class.union] CD5 Example with empty anonymous union member Unknown
2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
2092[temp.over] CD5 Deduction failure and overload resolution Unknown
2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
2102[expr.new] CD7 Constructor checking in new-expression Unknown
2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
2112[expr.new] CD5 new auto{x} Unknown
2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
2122[basic.lval] CD4 Glvalues of void type Unknown
2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
2125[class.copy.elision] NAD Copy elision and comma operator Unknown
2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
2151[intro.object] CD4 Exception object is not created Unknown
2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
2153[class.mem] CD4 pure-specifier in friend declaration Unknown
2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
2160[temp.func.order] open Issues with partial ordering Not resolved
2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
2181[implimits] C++20 Normative requirements in an informative Annex Unknown
2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
2183[except.spec] NAD Problems in description of potential exceptions Unknown
2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
2187[class.protected] drafting Protected members and access via qualified-id Not resolved
2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
2189[over.call.object] open Surrogate call template Not resolved
2190[cpp.cond] open Insufficient specification of __has_include Not resolved
2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
2194[over.match.list] drafting Impossible case in list initialization Not resolved
2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
2198[basic.link] C++17 Linkage of enumerators Unknown
2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
2204[class.base.init] NAD Naming delegated constructors Unknown
2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
2206[expr] C++17 Composite type of object and function pointers Unknown
2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
2209[except.ctor] NAD Destruction of constructed array elements Unknown
2210[except.ctor] NAD Principal/target constructor confusion Unknown
2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
2223[dcl.align] drafting Multiple alignas specifiers Not resolved
2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
2227[class.base.init] CD5 Destructor access and default member initializers Unknown
2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
2231[expr.ref] NAD Class member access to static data member template Unknown
2232[dcl.stc] open thread_local anonymous unions Not resolved
2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
2244[class.protected] open Base class access in aggregate initialization Not resolved
2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
2248[expr.delete] C++17 Problems with sized delete Unknown
2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
2255[temp.spec] CD5 Instantiated static data member templates Unknown
2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
2258[basic.life] open Storage deallocation during period of destruction Not resolved
2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
2262[dcl.asm] C++17 Attributes for asm-definition Unknown
2263[temp.inst] drafting Default argument instantiation for friends Not resolved
2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
2271[class.ctor] C++17 Aliasing this Unknown
2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
2283[expr.call] CD7 Missing complete type requirements Unknown
2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
2286[expr.assign] NAD Assignment evaluation order Unknown
2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
2297[intro.races] open Unclear specification of atomic operations Not resolved
2298[intro.races] open Actions and expression evaluation Not resolved
2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
2302[expr.eq] NAD Address comparison between different member subobjects Unknown
2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
2306[temp.friend] NAD Nested friend templates of class templates Unknown
2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
2317[class.base.init] CD5 Self-referential default member initializers Unknown
2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
2320[stmt.if] extension constexpr if and boolean conversions Extension
2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
2323[basic.types] C++20 Expunge POD Unknown
2324[intro.object] drafting Size of base class subobject Not resolved
2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
2330[temp.spec] CD5 Missing references to variable templates Unknown
2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
2334[intro.object] open Creation of objects by typeid Not resolved
2335[class.static.data] drafting Deduced return types vs member types Not resolved
2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
2343[temp.param] C++20 void* non-type template parameters Unknown
2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
2348[stmt.if] NAD Non-templated constexpr if Unknown
2349[stmt] NAD Class/enumeration names vs conditions Unknown
2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
2351[expr.type.conv] CD5 void{} Clang 20
2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
2354[basic.align] CD5 Extended alignment and object representation Clang 15
2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
2395[temp.param] drafting Parameters following a pack expansion Not resolved
2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
2413[temp.res] CD6 typename in conversion-function-ids Unknown
2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
2417[except.spec] open Explicit instantiation and exception specifications Not resolved
2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
2420[except.spec] dup Exception specifications in explicit instantiation Unknown
2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
2423[basic.pre] NAD Typedefs, names, and entities Unknown
2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
2428[temp.concept] C++23 Deprecating a concept Clang 19
2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
2435[temp.spec] open Alias template specializations Not resolved
2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
2440[expr.const] C++23 Allocation in core constant expressions Unknown
2441[dcl.inline] C++20 Inline function parameters Unknown
2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
2443[module.interface] C++23 Meaningless template exports Unknown
2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
2464[ptr.launder] CD6 Constexpr launder and unions Unknown
2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
2466[expr.await] CD6 co_await should be a single evaluation Unknown
2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
2472[expr.await] NAD Value categories in await-expressions Unknown
2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
2474[expr.delete] CD6 Cv-qualification and deletion Unknown
2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
2483[dcl.link] C++23 Language linkage of static member functions Unknown
2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
2489[intro.object] C++23 Storage provided by array of char Unknown
2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
2491[module.interface] CD6 Export of typedef after its first declaration Unknown
2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
2495[stmt.return] open Glvalue result of a function call Not resolved
2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
2507[over.oper.general] CD6 Default arguments for operator[] Unknown
2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
2511[class.bit] CD6 cv-qualified bit-fields Unknown
2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
2514[basic.life] open Modifying const subobjects Not resolved
2515[expr.call] open Result of a function call Not resolved
2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
2519[basic.types.general] CD7 Object representation of a bit-field Unknown
2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
2529[expr.const] C++23 Constant destruction of constexpr references Unknown
2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
2535[expr.ref] CD6 Type punning in class member access Unknown
2536[expr.const] open Partially initialized variables during constant initialization Not resolved
2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
2548[expr.add] NAD Array prvalues and additive operators Unknown
2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
2557[expr.ref] review Class member access referring to an unrelated class Not resolved
2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
2559[expr.const] open Defaulted consteval functions Not resolved
2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
2566[expr.new] review Matching deallocation for uncaught exception Not resolved
2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
2571[expr.sub] CD6 Evaluation order for subscripting Unknown
2572[over.over] review Address of overloaded function with no target Not resolved
2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
2580[cpp.line] CD7 Undefined behavior with #line Unknown
2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
2584[temp.over.link] open Equivalent types in function template declarations Not resolved
2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
2587[intro.races] review Visible side effects and initial value of an object Not resolved
2588[class.friend] CD7 friend declarations and module linkage Unknown
2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
2594[basic.start.main] CD6 Disallowing a global function template main Unknown
2595[special] CD7 "More constrained" for eligible special member functions Unknown
2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
2599[expr.call] C++23 What does initializing a parameter include? Unknown
2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
2607[module.interface] drafting Visibility of enumerator names Not resolved
2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
2609[expr.sizeof] open Padding in class types Not resolved
2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
2612[dcl.init.general] C++23 Incorrect comment in example Unknown
2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
2614[expr.ref] C++23 Unspecified results for class member access Unknown
2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
2632[intro.defs] drafting 'user-declared' is not defined Not resolved
2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
2635[dcl.pre] C++23 Constrained structured bindings Clang 16
2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
2639[lex.phases] C++23 new-lines after phase 1 Unknown
2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
2645[expr.call] C++23 Unused term "default argument promotions" Unknown
2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
2660[expr.call] open Confusing term "this parameter" Not resolved
2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
2665[basic.life] NAD Replacing a subobject with a complete object Unknown
2666[class.temporary] open Lifetime extension through static_cast Not resolved
2667[cpp.import] C++23 Named module imports do not import macros Unknown
2668[expr.await] CD7 co_await in a lambda-expression Unknown
2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
2670[basic.link] open Programs and translation units Not resolved
2671[dcl.meaning.general] open friend named by a template-id Not resolved
2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
2676[basic.life] open Replacing a complete object having base subobjects Not resolved
2677[basic.life] review Replacing union subobjects Not resolved
2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
2682[temp.pre] C++23 Templated function vs. function template Unknown
2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
2688[expr.call] open Calling explicit object member functions Not resolved
2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
2696[expr.rel] dup Relational comparisons of pointers to void Unknown
2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
2702[expr.const] open Constant destruction of reference members Not resolved
2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
2705[expr.ref] open Accessing ambiguous subobjects Not resolved
2706[basic.link] open Repeated structured binding declarations Not resolved
2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
2710[expr.const] CD7 Loops in constant expressions Unknown
2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
2719[basic.align] CD7 Creating objects in misaligned storage Unknown
2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
2721[basic.life] CD7 When exactly is storage reused? Unknown
2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
2727[module.import] open Importing header units synthesized from source files Not resolved
2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
2729[expr.new] CD7 Meaning of new-type-id Unknown
2730[over.match.oper] open Comparison templates on enumeration types Not resolved
2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
2734[expr.const] open Immediate forward-declared function templates Not resolved
2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
2744[intro.object] open Multiple objects of the same type at the same address Not resolved
2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
2746[temp.res.general] CD7 Checking of default template arguments Unknown
2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
2750[expr.const] CD7 construct_at without constructor call Unknown
2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
2752[lex.fcon] open Excess-precision floating-point literals Not resolved
2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
2756[class.init] review Completion of initialization by delegating constructor Not resolved
2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
2773[class.union.anon] open Naming anonymous union members as class members Not resolved
2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
2786[expr.eq] open Comparing pointers to complete objects Not resolved
2787[special] open Kind of explicit object copy/move assignment function Not resolved
2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
2799[class.default.ctor] drafting Inheriting default constructors Not resolved
2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
2805[expr.delete] open Underspecified selection of deallocation function Not resolved
2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
2807[class.dtor] CD7 Destructors declared consteval Unknown
2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
2812[expr.new] open Allocation with explicit alignment Not resolved
2813[expr.ref] CD7 Class member access with prvalues Clang 20
2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
2827[basic.fundamental] review Representation of unsigned integral types Not resolved
2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
2835[basic.scope.scope] open Name-independent declarations Not resolved
2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
2839[class.dtor] open Explicit destruction of base classes Not resolved
2840[basic.align] open Missing requirements for fundamental alignments Not resolved
2841[class.ctor.general] open When do const objects start being const? Not resolved
2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
2854[except.throw] CD7 Storage duration of exception objects Unknown
2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
2862[temp.pre] review Unclear boundaries of template declarations Not resolved
2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
2865[expr.cond] CD7 Regression on result of conditional operator Unknown
2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
2869[expr.prim.this] CD7 this in local classes Unknown
2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
2878[expr.cast] open C-style casts to reference types Not resolved
2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
2890[class.local] CD7 Defining members of local classes Unknown
2891[implimits] CD7 Normative status of implementation limits Unknown
2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
2904[temp.pre] open Introducing template-names Not resolved
2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
2912[expr.new] open Too-large value for size in array new Not resolved
2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
2920[temp.names] open The template keyword for base classes Not resolved
2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
2932[dcl.enum] review Value range of empty enumeration Not resolved
2933[expr.type] CD7 Dangling references Unknown
2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
2940[intro.object] review Definition of "object" Not resolved
2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
2951[temp.decls.general] open Distinguishing a primary template Not resolved
2952[basic.life] open Vacuous initialization for subobjects Not resolved
2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
2960[basic.life] open Introduce discontiguous object lifetime Not resolved
2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
2964[conv.lval] open Reading "invalid pointer values" Not resolved
2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
2967[over.match.ref] open Explicit conversion functions Not resolved
2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
2976[stmt.dcl] review Transferring control out of a function Not resolved
2977[dcl.init.general] review Initialization with string literals Not resolved
2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
2980[temp.names] open Constraints on template template parameters Not resolved
2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
2983[basic.pre] review Non-type template parameters are not variables Not resolved
2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
2991[dcl.init.general] open "array size" is vague Not resolved
2992[basic.pre] open Labels do not have names Not resolved
2993[dcl.fct.def.general] open Body of a destructor Not resolved
2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
3009[expr.const] open Unclear rules for constant initialization Not resolved
3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
3019[lex.header] open Restrictions on character sequences in header-names Not resolved
3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
3023[dcl.init.list] open Default arguments in list-initialization Not resolved
3024[dcl.align] open Alignment of references Not resolved
3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
3028[namespace.udecl] open A using-declarator should bind a name Not resolved
3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
3055[over.call.object] open Misleading body for surrogate call function Not resolved
3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
3058[basic.lookup.general] open "Program point" is not defined Not resolved
3059[expr.const] open throw; in constant expressions Not resolved
3060[basic.start.main] open Change in behavior for noexcept main Not resolved
3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
3065[basic.types.general] open Reachability and completeness of types Not resolved
3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
3090[module.interface] tentatively ready Internal linkage from header units Not resolved
3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved